ayousanz commited on
Commit
4fb706e
·
verified ·
1 Parent(s): e55f565

Upload folder using huggingface_hub

Browse files
Files changed (29) hide show
  1. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1.log +14 -14
  2. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml +1 -1
  3. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2.log +14 -14
  4. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml +1 -1
  5. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3.log +14 -14
  6. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml +1 -1
  7. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4.log +14 -14
  8. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml +1 -1
  9. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5.log +14 -14
  10. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml +1 -1
  11. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6.log +14 -14
  12. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml +1 -1
  13. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7.log +14 -14
  14. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml +1 -1
  15. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8.log +14 -14
  16. exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml +1 -1
  17. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml +3 -3
  18. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091035.92b100c97f43.1159464.0 +2 -2
  19. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091448.92b100c97f43.1179446.0 +3 -0
  20. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091666.92b100c97f43.1289026.0 +3 -0
  21. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091743.92b100c97f43.1324139.0 +3 -0
  22. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091448.92b100c97f43.1179446.1 +3 -0
  23. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091666.92b100c97f43.1289026.1 +3 -0
  24. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091743.92b100c97f43.1324139.1 +3 -0
  25. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.1.log +1342 -0
  26. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.2.log +0 -0
  27. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.3.log +1247 -0
  28. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.4.log +1212 -0
  29. exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.log +6 -978
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,215 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,440 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,563 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,676 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fea878fb160>
858
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fea9681f280>
869
- [92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,705 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,621 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,843 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,966 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,174 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f8714e3e1c0>
858
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f8723d63280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,185 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,204 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,163 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,507 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,643 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,653 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f31cda9c160>
858
- [92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f31dc9c2280>
869
- [92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,672 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,625 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,848 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,971 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,094 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,107 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,107 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,107 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f91b6d1e160>
858
+ [92b100c97f43] 2025-03-04 22:09:41,107 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,107 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f91c5f45280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,108 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,127 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,166 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,388 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,512 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,615 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,624 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f725c8cd0a0>
858
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f7278a78280>
869
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,643 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,630 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,851 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,974 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,143 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,153 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,154 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,154 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f44e9149100>
858
+ [92b100c97f43] 2025-03-04 22:09:41,154 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,154 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f44f806f190>
869
+ [92b100c97f43] 2025-03-04 22:09:41,154 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,173 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,214 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,437 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,561 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,634 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f24e600b100>
858
- [92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f24f4f30190>
869
- [92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,663 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,646 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,867 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,988 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,192 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fb42810e040>
858
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fb4442b9280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,203 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,222 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,507 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,547 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fd4c5ad51c0>
858
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fd4d49fa280>
869
- [92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,577 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,637 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,858 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,979 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,208 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f900e5431c0>
858
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f901d469280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,219 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,238 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,383 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,505 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,676 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f678ea391c0>
858
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f679d95f280>
869
- [92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,706 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,622 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,844 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,966 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,158 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f3c0ac0f160>
858
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f3c19b35280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,169 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,188 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,508 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,651 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f650237d1c0>
858
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f65112a3280>
869
- [92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,680 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,619 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,843 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,967 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,131 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fd21d2ef1c0>
858
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fd22c214280>
869
+ [92b100c97f43] 2025-03-04 22:09:41,142 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,161 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8.log CHANGED
@@ -1,14 +1,14 @@
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
- # Started at Tue Mar 4 21:23:26 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
- [92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41
8
- [92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
- [92b100c97f43] 2025-03-04 21:23:29,508 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
- [92b100c97f43] 2025-03-04 21:23:31,596 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
- [92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
@@ -844,7 +844,7 @@ Model summary:
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
- [92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
@@ -854,8 +854,8 @@ Parameter Group 0
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
- [92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fe8295891c0>
858
- [92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
@@ -865,10 +865,10 @@ Parameter Group 0
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
- [92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fe8384af280>
869
- [92b100c97f43] 2025-03-04 21:23:31,607 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml
870
- [92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
- # Accounting: time=11 threads=1
874
- # Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds
 
1
  # python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
2
+ # Started at Tue Mar 4 22:09:35 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200
7
+ [92b100c97f43] 2025-03-04 22:09:38,624 (gan_tts:304) INFO: Vocabulary size: 41
8
+ [92b100c97f43] 2025-03-04 22:09:38,844 (encoder:172) INFO: encoder self-attention layer type = self-attention
9
+ [92b100c97f43] 2025-03-04 22:09:38,966 (encoder:172) INFO: encoder self-attention layer type = self-attention
10
+ [92b100c97f43] 2025-03-04 22:09:41,199 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
11
+ [92b100c97f43] 2025-03-04 22:09:41,209 (abs_task:1158) INFO: Model structure:
12
  ESPnetGANTTSModel(
13
  (feats_extract): LogMelFbank(
14
  (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
 
844
  Number of trainable parameters: 83.28 M (100.0%)
845
  Size: 333.11 MB
846
  Type: torch.float32
847
+ [92b100c97f43] 2025-03-04 22:09:41,209 (abs_task:1161) INFO: Optimizer:
848
  AdamW (
849
  Parameter Group 0
850
  amsgrad: False
 
854
  lr: 0.0002
855
  weight_decay: 0.0
856
  )
857
+ [92b100c97f43] 2025-03-04 22:09:41,209 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fe0420a1100>
858
+ [92b100c97f43] 2025-03-04 22:09:41,209 (abs_task:1161) INFO: Optimizer2:
859
  AdamW (
860
  Parameter Group 0
861
  amsgrad: False
 
865
  lr: 0.0002
866
  weight_decay: 0.0
867
  )
868
+ [92b100c97f43] 2025-03-04 22:09:41,209 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7fe050fc7190>
869
+ [92b100c97f43] 2025-03-04 22:09:41,210 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml
870
+ [92b100c97f43] 2025-03-04 22:09:41,228 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=32, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['<blank>', '<unk>', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', '<sos/eos>'], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False)
871
  /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
872
  olens = (ilens - self.n_fft) // self.hop_length + 1
873
+ # Accounting: time=15 threads=1
874
+ # Ended (code 0) at Tue Mar 4 22:09:50 JST 2025, elapsed time 15 seconds
exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8
7
  ngpu: 0
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
6
  output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8
7
  ngpu: 0
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml CHANGED
@@ -6,7 +6,7 @@ iterator_type: sequence
6
  output_dir: exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk
7
  ngpu: 1
8
  seed: 777
9
- num_workers: 16
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
@@ -14,7 +14,7 @@ dist_world_size: 4
14
  dist_rank: 0
15
  local_rank: 0
16
  dist_master_addr: localhost
17
- dist_master_port: 52975
18
  dist_launcher: null
19
  multiprocessing_distributed: true
20
  unused_parameters: true
@@ -70,7 +70,7 @@ freeze_param: []
70
  num_iters_per_epoch: 1000
71
  batch_size: 20
72
  valid_batch_size: null
73
- batch_bins: 6000000
74
  valid_batch_bins: null
75
  train_shape_file:
76
  - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn
 
6
  output_dir: exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk
7
  ngpu: 1
8
  seed: 777
9
+ num_workers: 32
10
  num_att_plot: 3
11
  dist_backend: nccl
12
  dist_init_method: env://
 
14
  dist_rank: 0
15
  local_rank: 0
16
  dist_master_addr: localhost
17
+ dist_master_port: 59597
18
  dist_launcher: null
19
  multiprocessing_distributed: true
20
  unused_parameters: true
 
70
  num_iters_per_epoch: 1000
71
  batch_size: 20
72
  valid_batch_size: null
73
+ batch_bins: 9000000
74
  valid_batch_bins: null
75
  train_shape_file:
76
  - exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091035.92b100c97f43.1159464.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a8704d80d583693bc56ad0906c5fa8f30dc6156977506f396db7d40a7c398bd
3
- size 4873
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47ca4c2af2ae5fe4ef0943d758d223149e37a2e101fc264aae7be3a9cdbb57bf
3
+ size 6486
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091448.92b100c97f43.1179446.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e59d9a3123a950064a58b9351b715c74d50f04dfc4bbbda3049c9e7366862a7
3
+ size 1674
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091666.92b100c97f43.1289026.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6afddefdf687f1aec9663925e8bb8d5127831a5a2e7b3d5070c1684734cc7f00
3
+ size 88
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091743.92b100c97f43.1324139.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:212fa57bbb135b7b0557550baa81d440f17fdefc764a4b170da42cc08b3ae061
3
+ size 88
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091448.92b100c97f43.1179446.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e27cca46138cc3b95825e29775f472797d893c3278e7e926ceaa550d9316db8
3
+ size 88
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091666.92b100c97f43.1289026.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7bec30596fbaf8184b6cb5fadfeccfba94d87111a618757906ebbabe7fed01b
3
+ size 88
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091743.92b100c97f43.1324139.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b469790a7a5ba7a680abdd1ebac1353bd03e16b487dda1629f526d68b1843e74
3
+ size 88
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.1.log ADDED
@@ -0,0 +1,1342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
2
+ # Started at Tue Mar 4 21:35:13 JST 2025
3
+ #
4
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
+ /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
7
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
8
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
9
+ [92b100c97f43:0/4] 2025-03-04 21:35:19,077 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
10
+ [92b100c97f43:0/4] 2025-03-04 21:35:19,088 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
11
+ [92b100c97f43:0/4] 2025-03-04 21:35:19,138 (gan_tts:304) INFO: Vocabulary size: 41
12
+ [92b100c97f43:0/4] 2025-03-04 21:35:19,267 (encoder:172) INFO: encoder self-attention layer type = self-attention
13
+ [92b100c97f43:0/4] 2025-03-04 21:35:19,488 (encoder:172) INFO: encoder self-attention layer type = self-attention
14
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,642 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
15
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,651 (abs_task:1158) INFO: Model structure:
16
+ ESPnetGANTTSModel(
17
+ (feats_extract): LogMelFbank(
18
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
19
+ (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False)
20
+ )
21
+ (normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz, norm_means=True, norm_vars=True)
22
+ (pitch_extract): Dio()
23
+ (pitch_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz, norm_means=True, norm_vars=True)
24
+ (energy_extract): Energy(
25
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
26
+ )
27
+ (energy_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz, norm_means=True, norm_vars=True)
28
+ (tts): JETS(
29
+ (generator): JETSGenerator(
30
+ (encoder): Encoder(
31
+ (embed): Sequential(
32
+ (0): Embedding(41, 256, padding_idx=0)
33
+ (1): ScaledPositionalEncoding(
34
+ (dropout): Dropout(p=0.2, inplace=False)
35
+ )
36
+ )
37
+ (encoders): MultiSequential(
38
+ (0): EncoderLayer(
39
+ (self_attn): MultiHeadedAttention(
40
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
41
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
42
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
43
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
44
+ (dropout): Dropout(p=0.2, inplace=False)
45
+ )
46
+ (feed_forward): MultiLayeredConv1d(
47
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
48
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
49
+ (dropout): Dropout(p=0.2, inplace=False)
50
+ )
51
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
52
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
53
+ (dropout): Dropout(p=0.2, inplace=False)
54
+ )
55
+ (1): EncoderLayer(
56
+ (self_attn): MultiHeadedAttention(
57
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
58
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
59
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
60
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
61
+ (dropout): Dropout(p=0.2, inplace=False)
62
+ )
63
+ (feed_forward): MultiLayeredConv1d(
64
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
65
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
66
+ (dropout): Dropout(p=0.2, inplace=False)
67
+ )
68
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
69
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
70
+ (dropout): Dropout(p=0.2, inplace=False)
71
+ )
72
+ (2): EncoderLayer(
73
+ (self_attn): MultiHeadedAttention(
74
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
75
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
76
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
77
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
78
+ (dropout): Dropout(p=0.2, inplace=False)
79
+ )
80
+ (feed_forward): MultiLayeredConv1d(
81
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
82
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
83
+ (dropout): Dropout(p=0.2, inplace=False)
84
+ )
85
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
86
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
87
+ (dropout): Dropout(p=0.2, inplace=False)
88
+ )
89
+ (3): EncoderLayer(
90
+ (self_attn): MultiHeadedAttention(
91
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
92
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
93
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
94
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
95
+ (dropout): Dropout(p=0.2, inplace=False)
96
+ )
97
+ (feed_forward): MultiLayeredConv1d(
98
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
99
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
100
+ (dropout): Dropout(p=0.2, inplace=False)
101
+ )
102
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
103
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
104
+ (dropout): Dropout(p=0.2, inplace=False)
105
+ )
106
+ )
107
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
108
+ )
109
+ (duration_predictor): DurationPredictor(
110
+ (conv): ModuleList(
111
+ (0): Sequential(
112
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
113
+ (1): ReLU()
114
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
115
+ (3): Dropout(p=0.1, inplace=False)
116
+ )
117
+ (1): Sequential(
118
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
119
+ (1): ReLU()
120
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
121
+ (3): Dropout(p=0.1, inplace=False)
122
+ )
123
+ )
124
+ (linear): Linear(in_features=256, out_features=1, bias=True)
125
+ )
126
+ (pitch_predictor): VariancePredictor(
127
+ (conv): ModuleList(
128
+ (0): Sequential(
129
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
130
+ (1): ReLU()
131
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
132
+ (3): Dropout(p=0.5, inplace=False)
133
+ )
134
+ (1): Sequential(
135
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
136
+ (1): ReLU()
137
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
138
+ (3): Dropout(p=0.5, inplace=False)
139
+ )
140
+ (2): Sequential(
141
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
142
+ (1): ReLU()
143
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
144
+ (3): Dropout(p=0.5, inplace=False)
145
+ )
146
+ (3): Sequential(
147
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
148
+ (1): ReLU()
149
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
150
+ (3): Dropout(p=0.5, inplace=False)
151
+ )
152
+ (4): Sequential(
153
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
154
+ (1): ReLU()
155
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
156
+ (3): Dropout(p=0.5, inplace=False)
157
+ )
158
+ )
159
+ (linear): Linear(in_features=256, out_features=1, bias=True)
160
+ )
161
+ (pitch_embed): Sequential(
162
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
163
+ (1): Dropout(p=0.0, inplace=False)
164
+ )
165
+ (energy_predictor): VariancePredictor(
166
+ (conv): ModuleList(
167
+ (0): Sequential(
168
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
169
+ (1): ReLU()
170
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
171
+ (3): Dropout(p=0.5, inplace=False)
172
+ )
173
+ (1): Sequential(
174
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
175
+ (1): ReLU()
176
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
177
+ (3): Dropout(p=0.5, inplace=False)
178
+ )
179
+ )
180
+ (linear): Linear(in_features=256, out_features=1, bias=True)
181
+ )
182
+ (energy_embed): Sequential(
183
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
184
+ (1): Dropout(p=0.0, inplace=False)
185
+ )
186
+ (alignment_module): AlignmentModule(
187
+ (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
188
+ (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
189
+ (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,))
190
+ (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
191
+ (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
192
+ )
193
+ (length_regulator): GaussianUpsampling()
194
+ (decoder): Encoder(
195
+ (embed): Sequential(
196
+ (0): ScaledPositionalEncoding(
197
+ (dropout): Dropout(p=0.2, inplace=False)
198
+ )
199
+ )
200
+ (encoders): MultiSequential(
201
+ (0): EncoderLayer(
202
+ (self_attn): MultiHeadedAttention(
203
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
204
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
205
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
206
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
207
+ (dropout): Dropout(p=0.2, inplace=False)
208
+ )
209
+ (feed_forward): MultiLayeredConv1d(
210
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
211
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
212
+ (dropout): Dropout(p=0.2, inplace=False)
213
+ )
214
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
215
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
216
+ (dropout): Dropout(p=0.2, inplace=False)
217
+ )
218
+ (1): EncoderLayer(
219
+ (self_attn): MultiHeadedAttention(
220
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
221
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
222
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
223
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
224
+ (dropout): Dropout(p=0.2, inplace=False)
225
+ )
226
+ (feed_forward): MultiLayeredConv1d(
227
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
228
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
229
+ (dropout): Dropout(p=0.2, inplace=False)
230
+ )
231
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
232
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
233
+ (dropout): Dropout(p=0.2, inplace=False)
234
+ )
235
+ (2): EncoderLayer(
236
+ (self_attn): MultiHeadedAttention(
237
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
238
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
239
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
240
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
241
+ (dropout): Dropout(p=0.2, inplace=False)
242
+ )
243
+ (feed_forward): MultiLayeredConv1d(
244
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
245
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
246
+ (dropout): Dropout(p=0.2, inplace=False)
247
+ )
248
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
249
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
250
+ (dropout): Dropout(p=0.2, inplace=False)
251
+ )
252
+ (3): EncoderLayer(
253
+ (self_attn): MultiHeadedAttention(
254
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
255
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
256
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
257
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
258
+ (dropout): Dropout(p=0.2, inplace=False)
259
+ )
260
+ (feed_forward): MultiLayeredConv1d(
261
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
262
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
263
+ (dropout): Dropout(p=0.2, inplace=False)
264
+ )
265
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
266
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
267
+ (dropout): Dropout(p=0.2, inplace=False)
268
+ )
269
+ )
270
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
271
+ )
272
+ (generator): HiFiGANGenerator(
273
+ (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,))
274
+ (upsamples): ModuleList(
275
+ (0): Sequential(
276
+ (0): LeakyReLU(negative_slope=0.1)
277
+ (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
278
+ )
279
+ (1): Sequential(
280
+ (0): LeakyReLU(negative_slope=0.1)
281
+ (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
282
+ )
283
+ (2): Sequential(
284
+ (0): LeakyReLU(negative_slope=0.1)
285
+ (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
286
+ )
287
+ (3): Sequential(
288
+ (0): LeakyReLU(negative_slope=0.1)
289
+ (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
290
+ )
291
+ )
292
+ (blocks): ModuleList(
293
+ (0): ResidualBlock(
294
+ (convs1): ModuleList(
295
+ (0): Sequential(
296
+ (0): LeakyReLU(negative_slope=0.1)
297
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
298
+ )
299
+ (1): Sequential(
300
+ (0): LeakyReLU(negative_slope=0.1)
301
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
302
+ )
303
+ (2): Sequential(
304
+ (0): LeakyReLU(negative_slope=0.1)
305
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
306
+ )
307
+ )
308
+ (convs2): ModuleList(
309
+ (0): Sequential(
310
+ (0): LeakyReLU(negative_slope=0.1)
311
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
312
+ )
313
+ (1): Sequential(
314
+ (0): LeakyReLU(negative_slope=0.1)
315
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
316
+ )
317
+ (2): Sequential(
318
+ (0): LeakyReLU(negative_slope=0.1)
319
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
320
+ )
321
+ )
322
+ )
323
+ (1): ResidualBlock(
324
+ (convs1): ModuleList(
325
+ (0): Sequential(
326
+ (0): LeakyReLU(negative_slope=0.1)
327
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
328
+ )
329
+ (1): Sequential(
330
+ (0): LeakyReLU(negative_slope=0.1)
331
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
332
+ )
333
+ (2): Sequential(
334
+ (0): LeakyReLU(negative_slope=0.1)
335
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
336
+ )
337
+ )
338
+ (convs2): ModuleList(
339
+ (0): Sequential(
340
+ (0): LeakyReLU(negative_slope=0.1)
341
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
342
+ )
343
+ (1): Sequential(
344
+ (0): LeakyReLU(negative_slope=0.1)
345
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
346
+ )
347
+ (2): Sequential(
348
+ (0): LeakyReLU(negative_slope=0.1)
349
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
350
+ )
351
+ )
352
+ )
353
+ (2): ResidualBlock(
354
+ (convs1): ModuleList(
355
+ (0): Sequential(
356
+ (0): LeakyReLU(negative_slope=0.1)
357
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
358
+ )
359
+ (1): Sequential(
360
+ (0): LeakyReLU(negative_slope=0.1)
361
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
362
+ )
363
+ (2): Sequential(
364
+ (0): LeakyReLU(negative_slope=0.1)
365
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
366
+ )
367
+ )
368
+ (convs2): ModuleList(
369
+ (0): Sequential(
370
+ (0): LeakyReLU(negative_slope=0.1)
371
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
372
+ )
373
+ (1): Sequential(
374
+ (0): LeakyReLU(negative_slope=0.1)
375
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
376
+ )
377
+ (2): Sequential(
378
+ (0): LeakyReLU(negative_slope=0.1)
379
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
380
+ )
381
+ )
382
+ )
383
+ (3): ResidualBlock(
384
+ (convs1): ModuleList(
385
+ (0): Sequential(
386
+ (0): LeakyReLU(negative_slope=0.1)
387
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
388
+ )
389
+ (1): Sequential(
390
+ (0): LeakyReLU(negative_slope=0.1)
391
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
392
+ )
393
+ (2): Sequential(
394
+ (0): LeakyReLU(negative_slope=0.1)
395
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
396
+ )
397
+ )
398
+ (convs2): ModuleList(
399
+ (0): Sequential(
400
+ (0): LeakyReLU(negative_slope=0.1)
401
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
402
+ )
403
+ (1): Sequential(
404
+ (0): LeakyReLU(negative_slope=0.1)
405
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
406
+ )
407
+ (2): Sequential(
408
+ (0): LeakyReLU(negative_slope=0.1)
409
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
410
+ )
411
+ )
412
+ )
413
+ (4): ResidualBlock(
414
+ (convs1): ModuleList(
415
+ (0): Sequential(
416
+ (0): LeakyReLU(negative_slope=0.1)
417
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
418
+ )
419
+ (1): Sequential(
420
+ (0): LeakyReLU(negative_slope=0.1)
421
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
422
+ )
423
+ (2): Sequential(
424
+ (0): LeakyReLU(negative_slope=0.1)
425
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
426
+ )
427
+ )
428
+ (convs2): ModuleList(
429
+ (0): Sequential(
430
+ (0): LeakyReLU(negative_slope=0.1)
431
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
432
+ )
433
+ (1): Sequential(
434
+ (0): LeakyReLU(negative_slope=0.1)
435
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
436
+ )
437
+ (2): Sequential(
438
+ (0): LeakyReLU(negative_slope=0.1)
439
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
440
+ )
441
+ )
442
+ )
443
+ (5): ResidualBlock(
444
+ (convs1): ModuleList(
445
+ (0): Sequential(
446
+ (0): LeakyReLU(negative_slope=0.1)
447
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
448
+ )
449
+ (1): Sequential(
450
+ (0): LeakyReLU(negative_slope=0.1)
451
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
452
+ )
453
+ (2): Sequential(
454
+ (0): LeakyReLU(negative_slope=0.1)
455
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
456
+ )
457
+ )
458
+ (convs2): ModuleList(
459
+ (0): Sequential(
460
+ (0): LeakyReLU(negative_slope=0.1)
461
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
462
+ )
463
+ (1): Sequential(
464
+ (0): LeakyReLU(negative_slope=0.1)
465
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
466
+ )
467
+ (2): Sequential(
468
+ (0): LeakyReLU(negative_slope=0.1)
469
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
470
+ )
471
+ )
472
+ )
473
+ (6): ResidualBlock(
474
+ (convs1): ModuleList(
475
+ (0): Sequential(
476
+ (0): LeakyReLU(negative_slope=0.1)
477
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
478
+ )
479
+ (1): Sequential(
480
+ (0): LeakyReLU(negative_slope=0.1)
481
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
482
+ )
483
+ (2): Sequential(
484
+ (0): LeakyReLU(negative_slope=0.1)
485
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
486
+ )
487
+ )
488
+ (convs2): ModuleList(
489
+ (0): Sequential(
490
+ (0): LeakyReLU(negative_slope=0.1)
491
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
492
+ )
493
+ (1): Sequential(
494
+ (0): LeakyReLU(negative_slope=0.1)
495
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
496
+ )
497
+ (2): Sequential(
498
+ (0): LeakyReLU(negative_slope=0.1)
499
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
500
+ )
501
+ )
502
+ )
503
+ (7): ResidualBlock(
504
+ (convs1): ModuleList(
505
+ (0): Sequential(
506
+ (0): LeakyReLU(negative_slope=0.1)
507
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
508
+ )
509
+ (1): Sequential(
510
+ (0): LeakyReLU(negative_slope=0.1)
511
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
512
+ )
513
+ (2): Sequential(
514
+ (0): LeakyReLU(negative_slope=0.1)
515
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
516
+ )
517
+ )
518
+ (convs2): ModuleList(
519
+ (0): Sequential(
520
+ (0): LeakyReLU(negative_slope=0.1)
521
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
522
+ )
523
+ (1): Sequential(
524
+ (0): LeakyReLU(negative_slope=0.1)
525
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
526
+ )
527
+ (2): Sequential(
528
+ (0): LeakyReLU(negative_slope=0.1)
529
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
530
+ )
531
+ )
532
+ )
533
+ (8): ResidualBlock(
534
+ (convs1): ModuleList(
535
+ (0): Sequential(
536
+ (0): LeakyReLU(negative_slope=0.1)
537
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
538
+ )
539
+ (1): Sequential(
540
+ (0): LeakyReLU(negative_slope=0.1)
541
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
542
+ )
543
+ (2): Sequential(
544
+ (0): LeakyReLU(negative_slope=0.1)
545
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
546
+ )
547
+ )
548
+ (convs2): ModuleList(
549
+ (0): Sequential(
550
+ (0): LeakyReLU(negative_slope=0.1)
551
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
552
+ )
553
+ (1): Sequential(
554
+ (0): LeakyReLU(negative_slope=0.1)
555
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
556
+ )
557
+ (2): Sequential(
558
+ (0): LeakyReLU(negative_slope=0.1)
559
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
560
+ )
561
+ )
562
+ )
563
+ (9): ResidualBlock(
564
+ (convs1): ModuleList(
565
+ (0): Sequential(
566
+ (0): LeakyReLU(negative_slope=0.1)
567
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
568
+ )
569
+ (1): Sequential(
570
+ (0): LeakyReLU(negative_slope=0.1)
571
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
572
+ )
573
+ (2): Sequential(
574
+ (0): LeakyReLU(negative_slope=0.1)
575
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
576
+ )
577
+ )
578
+ (convs2): ModuleList(
579
+ (0): Sequential(
580
+ (0): LeakyReLU(negative_slope=0.1)
581
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
582
+ )
583
+ (1): Sequential(
584
+ (0): LeakyReLU(negative_slope=0.1)
585
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
586
+ )
587
+ (2): Sequential(
588
+ (0): LeakyReLU(negative_slope=0.1)
589
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
590
+ )
591
+ )
592
+ )
593
+ (10): ResidualBlock(
594
+ (convs1): ModuleList(
595
+ (0): Sequential(
596
+ (0): LeakyReLU(negative_slope=0.1)
597
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
598
+ )
599
+ (1): Sequential(
600
+ (0): LeakyReLU(negative_slope=0.1)
601
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
602
+ )
603
+ (2): Sequential(
604
+ (0): LeakyReLU(negative_slope=0.1)
605
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
606
+ )
607
+ )
608
+ (convs2): ModuleList(
609
+ (0): Sequential(
610
+ (0): LeakyReLU(negative_slope=0.1)
611
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
612
+ )
613
+ (1): Sequential(
614
+ (0): LeakyReLU(negative_slope=0.1)
615
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
616
+ )
617
+ (2): Sequential(
618
+ (0): LeakyReLU(negative_slope=0.1)
619
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
620
+ )
621
+ )
622
+ )
623
+ (11): ResidualBlock(
624
+ (convs1): ModuleList(
625
+ (0): Sequential(
626
+ (0): LeakyReLU(negative_slope=0.1)
627
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
628
+ )
629
+ (1): Sequential(
630
+ (0): LeakyReLU(negative_slope=0.1)
631
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
632
+ )
633
+ (2): Sequential(
634
+ (0): LeakyReLU(negative_slope=0.1)
635
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
636
+ )
637
+ )
638
+ (convs2): ModuleList(
639
+ (0): Sequential(
640
+ (0): LeakyReLU(negative_slope=0.1)
641
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
642
+ )
643
+ (1): Sequential(
644
+ (0): LeakyReLU(negative_slope=0.1)
645
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
646
+ )
647
+ (2): Sequential(
648
+ (0): LeakyReLU(negative_slope=0.1)
649
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
650
+ )
651
+ )
652
+ )
653
+ )
654
+ (output_conv): Sequential(
655
+ (0): LeakyReLU(negative_slope=0.01)
656
+ (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
657
+ (2): Tanh()
658
+ )
659
+ )
660
+ )
661
+ (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
662
+ (msd): HiFiGANMultiScaleDiscriminator(
663
+ (discriminators): ModuleList(
664
+ (0): HiFiGANScaleDiscriminator(
665
+ (layers): ModuleList(
666
+ (0): Sequential(
667
+ (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
668
+ (1): LeakyReLU(negative_slope=0.1)
669
+ )
670
+ (1): Sequential(
671
+ (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
672
+ (1): LeakyReLU(negative_slope=0.1)
673
+ )
674
+ (2): Sequential(
675
+ (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
676
+ (1): LeakyReLU(negative_slope=0.1)
677
+ )
678
+ (3): Sequential(
679
+ (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
680
+ (1): LeakyReLU(negative_slope=0.1)
681
+ )
682
+ (4): Sequential(
683
+ (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
684
+ (1): LeakyReLU(negative_slope=0.1)
685
+ )
686
+ (5): Sequential(
687
+ (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
688
+ (1): LeakyReLU(negative_slope=0.1)
689
+ )
690
+ (6): Sequential(
691
+ (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
692
+ (1): LeakyReLU(negative_slope=0.1)
693
+ )
694
+ (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
695
+ )
696
+ )
697
+ )
698
+ )
699
+ (mpd): HiFiGANMultiPeriodDiscriminator(
700
+ (discriminators): ModuleList(
701
+ (0): HiFiGANPeriodDiscriminator(
702
+ (convs): ModuleList(
703
+ (0): Sequential(
704
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
705
+ (1): LeakyReLU(negative_slope=0.1)
706
+ )
707
+ (1): Sequential(
708
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
709
+ (1): LeakyReLU(negative_slope=0.1)
710
+ )
711
+ (2): Sequential(
712
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
713
+ (1): LeakyReLU(negative_slope=0.1)
714
+ )
715
+ (3): Sequential(
716
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
717
+ (1): LeakyReLU(negative_slope=0.1)
718
+ )
719
+ (4): Sequential(
720
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
721
+ (1): LeakyReLU(negative_slope=0.1)
722
+ )
723
+ )
724
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
725
+ )
726
+ (1): HiFiGANPeriodDiscriminator(
727
+ (convs): ModuleList(
728
+ (0): Sequential(
729
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
730
+ (1): LeakyReLU(negative_slope=0.1)
731
+ )
732
+ (1): Sequential(
733
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
734
+ (1): LeakyReLU(negative_slope=0.1)
735
+ )
736
+ (2): Sequential(
737
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
738
+ (1): LeakyReLU(negative_slope=0.1)
739
+ )
740
+ (3): Sequential(
741
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
742
+ (1): LeakyReLU(negative_slope=0.1)
743
+ )
744
+ (4): Sequential(
745
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
746
+ (1): LeakyReLU(negative_slope=0.1)
747
+ )
748
+ )
749
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
750
+ )
751
+ (2): HiFiGANPeriodDiscriminator(
752
+ (convs): ModuleList(
753
+ (0): Sequential(
754
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
755
+ (1): LeakyReLU(negative_slope=0.1)
756
+ )
757
+ (1): Sequential(
758
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
759
+ (1): LeakyReLU(negative_slope=0.1)
760
+ )
761
+ (2): Sequential(
762
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
763
+ (1): LeakyReLU(negative_slope=0.1)
764
+ )
765
+ (3): Sequential(
766
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
767
+ (1): LeakyReLU(negative_slope=0.1)
768
+ )
769
+ (4): Sequential(
770
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
771
+ (1): LeakyReLU(negative_slope=0.1)
772
+ )
773
+ )
774
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
775
+ )
776
+ (3): HiFiGANPeriodDiscriminator(
777
+ (convs): ModuleList(
778
+ (0): Sequential(
779
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
780
+ (1): LeakyReLU(negative_slope=0.1)
781
+ )
782
+ (1): Sequential(
783
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
784
+ (1): LeakyReLU(negative_slope=0.1)
785
+ )
786
+ (2): Sequential(
787
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
788
+ (1): LeakyReLU(negative_slope=0.1)
789
+ )
790
+ (3): Sequential(
791
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
792
+ (1): LeakyReLU(negative_slope=0.1)
793
+ )
794
+ (4): Sequential(
795
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
796
+ (1): LeakyReLU(negative_slope=0.1)
797
+ )
798
+ )
799
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
800
+ )
801
+ (4): HiFiGANPeriodDiscriminator(
802
+ (convs): ModuleList(
803
+ (0): Sequential(
804
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
805
+ (1): LeakyReLU(negative_slope=0.1)
806
+ )
807
+ (1): Sequential(
808
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
809
+ (1): LeakyReLU(negative_slope=0.1)
810
+ )
811
+ (2): Sequential(
812
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
813
+ (1): LeakyReLU(negative_slope=0.1)
814
+ )
815
+ (3): Sequential(
816
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
817
+ (1): LeakyReLU(negative_slope=0.1)
818
+ )
819
+ (4): Sequential(
820
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
821
+ (1): LeakyReLU(negative_slope=0.1)
822
+ )
823
+ )
824
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
825
+ )
826
+ )
827
+ )
828
+ )
829
+ (generator_adv_loss): GeneratorAdversarialLoss()
830
+ (discriminator_adv_loss): DiscriminatorAdversarialLoss()
831
+ (feat_match_loss): FeatureMatchLoss()
832
+ (mel_loss): MelSpectrogramLoss(
833
+ (wav_to_mel): LogMelFbank(
834
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
835
+ (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False)
836
+ )
837
+ )
838
+ (var_loss): VarianceLoss(
839
+ (mse_criterion): MSELoss()
840
+ (duration_criterion): DurationPredictorLoss(
841
+ (criterion): MSELoss()
842
+ )
843
+ )
844
+ (forwardsum_loss): ForwardSumLoss()
845
+ )
846
+ )
847
+
848
+ Model summary:
849
+ Class Name: ESPnetGANTTSModel
850
+ Total Number of model parameters: 83.28 M
851
+ Number of trainable parameters: 83.28 M (100.0%)
852
+ Size: 333.11 MB
853
+ Type: torch.float32
854
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,651 (abs_task:1161) INFO: Optimizer:
855
+ AdamW (
856
+ Parameter Group 0
857
+ amsgrad: False
858
+ betas: [0.8, 0.99]
859
+ eps: 1e-09
860
+ initial_lr: 0.0002
861
+ lr: 0.0002
862
+ weight_decay: 0.0
863
+ )
864
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,651 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f1cdc2a85b0>
865
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,651 (abs_task:1161) INFO: Optimizer2:
866
+ AdamW (
867
+ Parameter Group 0
868
+ amsgrad: False
869
+ betas: [0.8, 0.99]
870
+ eps: 1e-09
871
+ initial_lr: 0.0002
872
+ lr: 0.0002
873
+ weight_decay: 0.0
874
+ )
875
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,652 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f1de6a6b9d0>
876
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,652 (abs_task:1171) INFO: Saving the configuration in exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml
877
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,974 (abs_task:1525) INFO: [train] dataset:
878
+ ESPnetDataset(
879
+ text: {"path": "dump/raw/jvs010_tr_no_dev/text", "type": "text"}
880
+ speech: {"path": "dump/raw/jvs010_tr_no_dev/wav.scp", "type": "sound"}
881
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f1cdc2a8e20>)
882
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,974 (abs_task:1526) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=3, batch_bins=9000000, sort_in_batch=descending, sort_batch=descending)
883
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,974 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=3, mean=33.3, min=6, max=53
884
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,994 (abs_task:1525) INFO: [valid] dataset:
885
+ ESPnetDataset(
886
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
887
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
888
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f1cdc2a8550>)
889
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,994 (abs_task:1526) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=1, batch_bins=9000000, sort_in_batch=descending, sort_batch=descending)
890
+ [92b100c97f43:0/4] 2025-03-04 21:35:42,994 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=1, mean=15.0, min=15, max=15
891
+ [92b100c97f43:0/4] 2025-03-04 21:35:43,014 (abs_task:1525) INFO: [plot_att] dataset:
892
+ ESPnetDataset(
893
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
894
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
895
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f1cdc264190>)
896
+ [92b100c97f43:0/4] 2025-03-04 21:35:43,014 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15, batch_size=1, key_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn,
897
+ [92b100c97f43:0/4] 2025-03-04 21:35:43,014 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
898
+ 92b100c97f43:1324139:1324139 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
899
+ 92b100c97f43:1324139:1324139 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
900
+
901
+ 92b100c97f43:1324139:1324139 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
902
+ 92b100c97f43:1324139:1324139 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
903
+ 92b100c97f43:1324139:1324139 [0] NCCL INFO Using network Socket
904
+ NCCL version 2.10.3+cuda11.3
905
+ 92b100c97f43:1324142:1324142 [3] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
906
+ 92b100c97f43:1324140:1324140 [1] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
907
+ 92b100c97f43:1324142:1324142 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
908
+ 92b100c97f43:1324140:1324140 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
909
+
910
+ 92b100c97f43:1324142:1324142 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
911
+
912
+ 92b100c97f43:1324140:1324140 [1] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
913
+ 92b100c97f43:1324142:1324142 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
914
+ 92b100c97f43:1324140:1324140 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
915
+ 92b100c97f43:1324142:1324142 [3] NCCL INFO Using network Socket
916
+ 92b100c97f43:1324140:1324140 [1] NCCL INFO Using network Socket
917
+ 92b100c97f43:1324141:1324141 [2] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
918
+ 92b100c97f43:1324141:1324141 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
919
+
920
+ 92b100c97f43:1324141:1324141 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
921
+ 92b100c97f43:1324141:1324141 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
922
+ 92b100c97f43:1324141:1324141 [2] NCCL INFO Using network Socket
923
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Channel 00/02 : 0 1 2 3
924
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
925
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Channel 01/02 : 0 1 2 3
926
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
927
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
928
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
929
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff
930
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff
931
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff
932
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff
933
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Channel 00 : 1[40] -> 2[50] via direct shared memory
934
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Channel 00 : 3[60] -> 0[30] via direct shared memory
935
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Channel 01 : 1[40] -> 2[50] via direct shared memory
936
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Channel 01 : 3[60] -> 0[30] via direct shared memory
937
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Channel 00 : 2[50] -> 3[60] via direct shared memory
938
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Channel 00 : 0[30] -> 1[40] via direct shared memory
939
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Channel 01 : 2[50] -> 3[60] via direct shared memory
940
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Channel 01 : 0[30] -> 1[40] via direct shared memory
941
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Connected all rings
942
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Connected all rings
943
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Channel 00 : 3[60] -> 2[50] via direct shared memory
944
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Channel 01 : 3[60] -> 2[50] via direct shared memory
945
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Connected all rings
946
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Connected all rings
947
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Channel 00 : 1[40] -> 0[30] via direct shared memory
948
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Channel 01 : 1[40] -> 0[30] via direct shared memory
949
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Channel 00 : 2[50] -> 1[40] via direct shared memory
950
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Channel 01 : 2[50] -> 1[40] via direct shared memory
951
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO Connected all trees
952
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
953
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
954
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO Connected all trees
955
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
956
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
957
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO Connected all trees
958
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
959
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
960
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO Connected all trees
961
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
962
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
963
+ 92b100c97f43:1324140:1324178 [1] NCCL INFO comm 0x7ff3c40030d0 rank 1 nranks 4 cudaDev 1 busId 40 - Init COMPLETE
964
+ 92b100c97f43:1324142:1324179 [3] NCCL INFO comm 0x7f4ea80030d0 rank 3 nranks 4 cudaDev 3 busId 60 - Init COMPLETE
965
+ 92b100c97f43:1324139:1324177 [0] NCCL INFO comm 0x7f1bec0030d0 rank 0 nranks 4 cudaDev 0 busId 30 - Init COMPLETE
966
+ 92b100c97f43:1324139:1324139 [0] NCCL INFO Launch mode Parallel
967
+ 92b100c97f43:1324141:1324180 [2] NCCL INFO comm 0x7f0acc0030d0 rank 2 nranks 4 cudaDev 2 busId 50 - Init COMPLETE
968
+ [92b100c97f43:0/4] 2025-03-04 21:35:43,535 (trainer:280) INFO: 1/130epoch started
969
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
970
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
971
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
972
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
973
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
974
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
975
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
976
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
977
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
978
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
979
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
980
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
981
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
982
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
983
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
984
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
985
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
986
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
987
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
988
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
989
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
990
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
991
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
992
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
993
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
994
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
995
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
996
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
997
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
998
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
999
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1000
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1001
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1002
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1003
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1004
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1005
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1006
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1007
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1008
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1009
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1010
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1011
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1012
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1013
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1014
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1015
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1016
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1017
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1018
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1019
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1020
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1021
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1022
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1023
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1024
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1025
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1026
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1027
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1028
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1029
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1030
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1031
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1032
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1033
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1034
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1035
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1036
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1037
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1038
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1039
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1040
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1041
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1042
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1043
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1044
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1045
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1046
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1047
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1048
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1049
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1050
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1051
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1052
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1053
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1054
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1055
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1056
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1057
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1058
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1059
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1060
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1061
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1062
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1063
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1064
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1065
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1066
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1067
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1068
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1069
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1070
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1071
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1072
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1073
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1074
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1075
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1076
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1077
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1078
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1079
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1080
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1081
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1082
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1083
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1084
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1085
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1086
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1087
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1088
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1089
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1090
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1091
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1092
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1093
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1094
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1095
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1096
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1097
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1098
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1099
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1100
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1101
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1102
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1103
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1104
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1105
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1106
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1107
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1108
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1109
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1110
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1111
+ Process SpawnProcess-3:
1112
+ Traceback (most recent call last):
1113
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1114
+ self.run()
1115
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1116
+ self._target(*self._args, **self._kwargs)
1117
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1118
+ cls.trainer.run(
1119
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1120
+ all_steps_are_invalid = cls.train_one_epoch(
1121
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 160, in train_one_epoch
1122
+ retval = model(forward_generator=turn == "generator", **batch)
1123
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1124
+ return forward_call(*input, **kwargs)
1125
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 886, in forward
1126
+ output = self.module(*inputs[0], **kwargs[0])
1127
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1128
+ return forward_call(*input, **kwargs)
1129
+ File "/work/espnet/espnet2/gan_tts/espnet_model.py", line 164, in forward
1130
+ return self.tts(**batch)
1131
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1132
+ return forward_call(*input, **kwargs)
1133
+ File "/work/espnet/espnet2/gan_tts/jets/jets.py", line 339, in forward
1134
+ return self._forward_generator(
1135
+ File "/work/espnet/espnet2/gan_tts/jets/jets.py", line 452, in _forward_generator
1136
+ feat_match_loss = self.feat_match_loss(p_hat, p)
1137
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1138
+ return forward_call(*input, **kwargs)
1139
+ File "/work/espnet/espnet2/gan_tts/hifigan/loss.py", line 210, in forward
1140
+ feat_match_loss_ += F.l1_loss(feat_hat_, feat_.detach())
1141
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 3081, in l1_loss
1142
+ return torch._C._nn.l1_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
1143
+ RuntimeError: CUDA out of memory. Tried to allocate 80.00 MiB (GPU 2; 21.96 GiB total capacity; 7.64 GiB already allocated; 66.88 MiB free; 7.94 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
1144
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1145
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1146
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1147
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1148
+ Process SpawnProcess-1:
1149
+ Traceback (most recent call last):
1150
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1151
+ self.run()
1152
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1153
+ self._target(*self._args, **self._kwargs)
1154
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1155
+ cls.trainer.run(
1156
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1157
+ all_steps_are_invalid = cls.train_one_epoch(
1158
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 160, in train_one_epoch
1159
+ retval = model(forward_generator=turn == "generator", **batch)
1160
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1161
+ return forward_call(*input, **kwargs)
1162
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 886, in forward
1163
+ output = self.module(*inputs[0], **kwargs[0])
1164
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1165
+ return forward_call(*input, **kwargs)
1166
+ File "/work/espnet/espnet2/gan_tts/espnet_model.py", line 164, in forward
1167
+ return self.tts(**batch)
1168
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1169
+ return forward_call(*input, **kwargs)
1170
+ File "/work/espnet/espnet2/gan_tts/jets/jets.py", line 339, in forward
1171
+ return self._forward_generator(
1172
+ File "/work/espnet/espnet2/gan_tts/jets/jets.py", line 407, in _forward_generator
1173
+ outs = self.generator(
1174
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1175
+ return forward_call(*input, **kwargs)
1176
+ File "/work/espnet/espnet2/gan_tts/jets/generator.py", line 626, in forward
1177
+ wav = self.generator(z_segments)
1178
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1179
+ return forward_call(*input, **kwargs)
1180
+ File "/work/espnet/espnet2/gan_tts/hifigan/hifigan.py", line 160, in forward
1181
+ cs += self.blocks[i * self.num_blocks + j](c)
1182
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1183
+ return forward_call(*input, **kwargs)
1184
+ File "/work/espnet/espnet2/gan_tts/hifigan/residual_block.py", line 97, in forward
1185
+ xt = self.convs2[idx](xt)
1186
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1187
+ return forward_call(*input, **kwargs)
1188
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/container.py", line 141, in forward
1189
+ input = module(input)
1190
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
1191
+ return forward_call(*input, **kwargs)
1192
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/activation.py", line 738, in forward
1193
+ return F.leaky_relu(input, self.negative_slope, self.inplace)
1194
+ File "/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py", line 1475, in leaky_relu
1195
+ result = torch._C._nn.leaky_relu(input, negative_slope)
1196
+ RuntimeError: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 21.96 GiB total capacity; 6.66 GiB already allocated; 3.50 MiB free; 6.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
1197
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1198
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1199
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1200
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1201
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1202
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1203
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1204
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1205
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1206
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1207
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1208
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1209
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1210
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1211
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1212
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1213
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1214
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1215
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1216
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1217
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1218
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1219
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1220
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1221
+ Traceback (most recent call last):
1222
+ File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
1223
+ return _run_code(code, main_globals, None,
1224
+ File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
1225
+ exec(code, run_globals)
1226
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 22, in <module>
1227
+ main()
1228
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 18, in main
1229
+ GANTTSTask.main(cmd=cmd)
1230
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1069, in main
1231
+ while not ProcessContext(processes, error_queues).join():
1232
+ File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 139, in join
1233
+ raise ProcessExitedException(
1234
+ torch.multiprocessing.spawn.ProcessExitedException: process 2 terminated with exit code 1
1235
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1236
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1237
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1238
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1239
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1240
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1241
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1242
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1243
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1244
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1245
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1246
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1247
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1248
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1249
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1250
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1251
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1252
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1253
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1254
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1255
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1256
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1257
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1258
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1259
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1260
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1261
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1262
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1263
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1264
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1265
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1266
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1267
+ # Accounting: time=55 threads=1
1268
+ # Ended (code 1) at Tue Mar 4 21:36:08 JST 2025, elapsed time 55 seconds
1269
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1270
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1271
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1272
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1273
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1274
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1275
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1276
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1277
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1278
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1279
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1280
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1281
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1282
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1283
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1284
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1285
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1286
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1287
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1288
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1289
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1290
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1291
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1292
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1293
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1294
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1295
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1296
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1297
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1298
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1299
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1300
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1301
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1302
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1303
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1304
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1305
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1306
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1307
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1308
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1309
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1310
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1311
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1312
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1313
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1314
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1315
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1316
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1317
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1318
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1319
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1320
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1321
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1322
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1323
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1324
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1325
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1326
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1327
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1328
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1329
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1330
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1331
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1332
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1333
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1334
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1335
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1336
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1337
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1338
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1339
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1340
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1341
+ /usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 208 leaked semaphore objects to clean up at shutdown
1342
+ warnings.warn('resource_tracker: There appear to be %d '
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.2.log ADDED
The diff for this file is too large to render. See raw diff
 
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.3.log ADDED
@@ -0,0 +1,1247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
2
+ # Started at Tue Mar 4 21:30:32 JST 2025
3
+ #
4
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
+ /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
7
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
8
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
9
+ [92b100c97f43:0/4] 2025-03-04 21:30:38,282 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
10
+ [92b100c97f43:0/4] 2025-03-04 21:30:38,292 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
11
+ [92b100c97f43:0/4] 2025-03-04 21:30:38,340 (gan_tts:304) INFO: Vocabulary size: 41
12
+ [92b100c97f43:0/4] 2025-03-04 21:30:38,470 (encoder:172) INFO: encoder self-attention layer type = self-attention
13
+ [92b100c97f43:0/4] 2025-03-04 21:30:38,693 (encoder:172) INFO: encoder self-attention layer type = self-attention
14
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,860 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
15
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,870 (abs_task:1158) INFO: Model structure:
16
+ ESPnetGANTTSModel(
17
+ (feats_extract): LogMelFbank(
18
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
19
+ (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False)
20
+ )
21
+ (normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz, norm_means=True, norm_vars=True)
22
+ (pitch_extract): Dio()
23
+ (pitch_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz, norm_means=True, norm_vars=True)
24
+ (energy_extract): Energy(
25
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
26
+ )
27
+ (energy_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz, norm_means=True, norm_vars=True)
28
+ (tts): JETS(
29
+ (generator): JETSGenerator(
30
+ (encoder): Encoder(
31
+ (embed): Sequential(
32
+ (0): Embedding(41, 256, padding_idx=0)
33
+ (1): ScaledPositionalEncoding(
34
+ (dropout): Dropout(p=0.2, inplace=False)
35
+ )
36
+ )
37
+ (encoders): MultiSequential(
38
+ (0): EncoderLayer(
39
+ (self_attn): MultiHeadedAttention(
40
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
41
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
42
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
43
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
44
+ (dropout): Dropout(p=0.2, inplace=False)
45
+ )
46
+ (feed_forward): MultiLayeredConv1d(
47
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
48
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
49
+ (dropout): Dropout(p=0.2, inplace=False)
50
+ )
51
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
52
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
53
+ (dropout): Dropout(p=0.2, inplace=False)
54
+ )
55
+ (1): EncoderLayer(
56
+ (self_attn): MultiHeadedAttention(
57
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
58
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
59
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
60
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
61
+ (dropout): Dropout(p=0.2, inplace=False)
62
+ )
63
+ (feed_forward): MultiLayeredConv1d(
64
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
65
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
66
+ (dropout): Dropout(p=0.2, inplace=False)
67
+ )
68
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
69
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
70
+ (dropout): Dropout(p=0.2, inplace=False)
71
+ )
72
+ (2): EncoderLayer(
73
+ (self_attn): MultiHeadedAttention(
74
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
75
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
76
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
77
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
78
+ (dropout): Dropout(p=0.2, inplace=False)
79
+ )
80
+ (feed_forward): MultiLayeredConv1d(
81
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
82
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
83
+ (dropout): Dropout(p=0.2, inplace=False)
84
+ )
85
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
86
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
87
+ (dropout): Dropout(p=0.2, inplace=False)
88
+ )
89
+ (3): EncoderLayer(
90
+ (self_attn): MultiHeadedAttention(
91
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
92
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
93
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
94
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
95
+ (dropout): Dropout(p=0.2, inplace=False)
96
+ )
97
+ (feed_forward): MultiLayeredConv1d(
98
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
99
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
100
+ (dropout): Dropout(p=0.2, inplace=False)
101
+ )
102
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
103
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
104
+ (dropout): Dropout(p=0.2, inplace=False)
105
+ )
106
+ )
107
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
108
+ )
109
+ (duration_predictor): DurationPredictor(
110
+ (conv): ModuleList(
111
+ (0): Sequential(
112
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
113
+ (1): ReLU()
114
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
115
+ (3): Dropout(p=0.1, inplace=False)
116
+ )
117
+ (1): Sequential(
118
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
119
+ (1): ReLU()
120
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
121
+ (3): Dropout(p=0.1, inplace=False)
122
+ )
123
+ )
124
+ (linear): Linear(in_features=256, out_features=1, bias=True)
125
+ )
126
+ (pitch_predictor): VariancePredictor(
127
+ (conv): ModuleList(
128
+ (0): Sequential(
129
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
130
+ (1): ReLU()
131
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
132
+ (3): Dropout(p=0.5, inplace=False)
133
+ )
134
+ (1): Sequential(
135
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
136
+ (1): ReLU()
137
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
138
+ (3): Dropout(p=0.5, inplace=False)
139
+ )
140
+ (2): Sequential(
141
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
142
+ (1): ReLU()
143
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
144
+ (3): Dropout(p=0.5, inplace=False)
145
+ )
146
+ (3): Sequential(
147
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
148
+ (1): ReLU()
149
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
150
+ (3): Dropout(p=0.5, inplace=False)
151
+ )
152
+ (4): Sequential(
153
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
154
+ (1): ReLU()
155
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
156
+ (3): Dropout(p=0.5, inplace=False)
157
+ )
158
+ )
159
+ (linear): Linear(in_features=256, out_features=1, bias=True)
160
+ )
161
+ (pitch_embed): Sequential(
162
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
163
+ (1): Dropout(p=0.0, inplace=False)
164
+ )
165
+ (energy_predictor): VariancePredictor(
166
+ (conv): ModuleList(
167
+ (0): Sequential(
168
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
169
+ (1): ReLU()
170
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
171
+ (3): Dropout(p=0.5, inplace=False)
172
+ )
173
+ (1): Sequential(
174
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
175
+ (1): ReLU()
176
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
177
+ (3): Dropout(p=0.5, inplace=False)
178
+ )
179
+ )
180
+ (linear): Linear(in_features=256, out_features=1, bias=True)
181
+ )
182
+ (energy_embed): Sequential(
183
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
184
+ (1): Dropout(p=0.0, inplace=False)
185
+ )
186
+ (alignment_module): AlignmentModule(
187
+ (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
188
+ (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
189
+ (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,))
190
+ (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
191
+ (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
192
+ )
193
+ (length_regulator): GaussianUpsampling()
194
+ (decoder): Encoder(
195
+ (embed): Sequential(
196
+ (0): ScaledPositionalEncoding(
197
+ (dropout): Dropout(p=0.2, inplace=False)
198
+ )
199
+ )
200
+ (encoders): MultiSequential(
201
+ (0): EncoderLayer(
202
+ (self_attn): MultiHeadedAttention(
203
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
204
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
205
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
206
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
207
+ (dropout): Dropout(p=0.2, inplace=False)
208
+ )
209
+ (feed_forward): MultiLayeredConv1d(
210
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
211
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
212
+ (dropout): Dropout(p=0.2, inplace=False)
213
+ )
214
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
215
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
216
+ (dropout): Dropout(p=0.2, inplace=False)
217
+ )
218
+ (1): EncoderLayer(
219
+ (self_attn): MultiHeadedAttention(
220
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
221
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
222
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
223
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
224
+ (dropout): Dropout(p=0.2, inplace=False)
225
+ )
226
+ (feed_forward): MultiLayeredConv1d(
227
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
228
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
229
+ (dropout): Dropout(p=0.2, inplace=False)
230
+ )
231
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
232
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
233
+ (dropout): Dropout(p=0.2, inplace=False)
234
+ )
235
+ (2): EncoderLayer(
236
+ (self_attn): MultiHeadedAttention(
237
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
238
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
239
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
240
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
241
+ (dropout): Dropout(p=0.2, inplace=False)
242
+ )
243
+ (feed_forward): MultiLayeredConv1d(
244
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
245
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
246
+ (dropout): Dropout(p=0.2, inplace=False)
247
+ )
248
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
249
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
250
+ (dropout): Dropout(p=0.2, inplace=False)
251
+ )
252
+ (3): EncoderLayer(
253
+ (self_attn): MultiHeadedAttention(
254
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
255
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
256
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
257
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
258
+ (dropout): Dropout(p=0.2, inplace=False)
259
+ )
260
+ (feed_forward): MultiLayeredConv1d(
261
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
262
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
263
+ (dropout): Dropout(p=0.2, inplace=False)
264
+ )
265
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
266
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
267
+ (dropout): Dropout(p=0.2, inplace=False)
268
+ )
269
+ )
270
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
271
+ )
272
+ (generator): HiFiGANGenerator(
273
+ (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,))
274
+ (upsamples): ModuleList(
275
+ (0): Sequential(
276
+ (0): LeakyReLU(negative_slope=0.1)
277
+ (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
278
+ )
279
+ (1): Sequential(
280
+ (0): LeakyReLU(negative_slope=0.1)
281
+ (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
282
+ )
283
+ (2): Sequential(
284
+ (0): LeakyReLU(negative_slope=0.1)
285
+ (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
286
+ )
287
+ (3): Sequential(
288
+ (0): LeakyReLU(negative_slope=0.1)
289
+ (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
290
+ )
291
+ )
292
+ (blocks): ModuleList(
293
+ (0): ResidualBlock(
294
+ (convs1): ModuleList(
295
+ (0): Sequential(
296
+ (0): LeakyReLU(negative_slope=0.1)
297
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
298
+ )
299
+ (1): Sequential(
300
+ (0): LeakyReLU(negative_slope=0.1)
301
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
302
+ )
303
+ (2): Sequential(
304
+ (0): LeakyReLU(negative_slope=0.1)
305
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
306
+ )
307
+ )
308
+ (convs2): ModuleList(
309
+ (0): Sequential(
310
+ (0): LeakyReLU(negative_slope=0.1)
311
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
312
+ )
313
+ (1): Sequential(
314
+ (0): LeakyReLU(negative_slope=0.1)
315
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
316
+ )
317
+ (2): Sequential(
318
+ (0): LeakyReLU(negative_slope=0.1)
319
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
320
+ )
321
+ )
322
+ )
323
+ (1): ResidualBlock(
324
+ (convs1): ModuleList(
325
+ (0): Sequential(
326
+ (0): LeakyReLU(negative_slope=0.1)
327
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
328
+ )
329
+ (1): Sequential(
330
+ (0): LeakyReLU(negative_slope=0.1)
331
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
332
+ )
333
+ (2): Sequential(
334
+ (0): LeakyReLU(negative_slope=0.1)
335
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
336
+ )
337
+ )
338
+ (convs2): ModuleList(
339
+ (0): Sequential(
340
+ (0): LeakyReLU(negative_slope=0.1)
341
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
342
+ )
343
+ (1): Sequential(
344
+ (0): LeakyReLU(negative_slope=0.1)
345
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
346
+ )
347
+ (2): Sequential(
348
+ (0): LeakyReLU(negative_slope=0.1)
349
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
350
+ )
351
+ )
352
+ )
353
+ (2): ResidualBlock(
354
+ (convs1): ModuleList(
355
+ (0): Sequential(
356
+ (0): LeakyReLU(negative_slope=0.1)
357
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
358
+ )
359
+ (1): Sequential(
360
+ (0): LeakyReLU(negative_slope=0.1)
361
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
362
+ )
363
+ (2): Sequential(
364
+ (0): LeakyReLU(negative_slope=0.1)
365
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
366
+ )
367
+ )
368
+ (convs2): ModuleList(
369
+ (0): Sequential(
370
+ (0): LeakyReLU(negative_slope=0.1)
371
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
372
+ )
373
+ (1): Sequential(
374
+ (0): LeakyReLU(negative_slope=0.1)
375
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
376
+ )
377
+ (2): Sequential(
378
+ (0): LeakyReLU(negative_slope=0.1)
379
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
380
+ )
381
+ )
382
+ )
383
+ (3): ResidualBlock(
384
+ (convs1): ModuleList(
385
+ (0): Sequential(
386
+ (0): LeakyReLU(negative_slope=0.1)
387
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
388
+ )
389
+ (1): Sequential(
390
+ (0): LeakyReLU(negative_slope=0.1)
391
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
392
+ )
393
+ (2): Sequential(
394
+ (0): LeakyReLU(negative_slope=0.1)
395
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
396
+ )
397
+ )
398
+ (convs2): ModuleList(
399
+ (0): Sequential(
400
+ (0): LeakyReLU(negative_slope=0.1)
401
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
402
+ )
403
+ (1): Sequential(
404
+ (0): LeakyReLU(negative_slope=0.1)
405
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
406
+ )
407
+ (2): Sequential(
408
+ (0): LeakyReLU(negative_slope=0.1)
409
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
410
+ )
411
+ )
412
+ )
413
+ (4): ResidualBlock(
414
+ (convs1): ModuleList(
415
+ (0): Sequential(
416
+ (0): LeakyReLU(negative_slope=0.1)
417
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
418
+ )
419
+ (1): Sequential(
420
+ (0): LeakyReLU(negative_slope=0.1)
421
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
422
+ )
423
+ (2): Sequential(
424
+ (0): LeakyReLU(negative_slope=0.1)
425
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
426
+ )
427
+ )
428
+ (convs2): ModuleList(
429
+ (0): Sequential(
430
+ (0): LeakyReLU(negative_slope=0.1)
431
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
432
+ )
433
+ (1): Sequential(
434
+ (0): LeakyReLU(negative_slope=0.1)
435
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
436
+ )
437
+ (2): Sequential(
438
+ (0): LeakyReLU(negative_slope=0.1)
439
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
440
+ )
441
+ )
442
+ )
443
+ (5): ResidualBlock(
444
+ (convs1): ModuleList(
445
+ (0): Sequential(
446
+ (0): LeakyReLU(negative_slope=0.1)
447
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
448
+ )
449
+ (1): Sequential(
450
+ (0): LeakyReLU(negative_slope=0.1)
451
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
452
+ )
453
+ (2): Sequential(
454
+ (0): LeakyReLU(negative_slope=0.1)
455
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
456
+ )
457
+ )
458
+ (convs2): ModuleList(
459
+ (0): Sequential(
460
+ (0): LeakyReLU(negative_slope=0.1)
461
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
462
+ )
463
+ (1): Sequential(
464
+ (0): LeakyReLU(negative_slope=0.1)
465
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
466
+ )
467
+ (2): Sequential(
468
+ (0): LeakyReLU(negative_slope=0.1)
469
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
470
+ )
471
+ )
472
+ )
473
+ (6): ResidualBlock(
474
+ (convs1): ModuleList(
475
+ (0): Sequential(
476
+ (0): LeakyReLU(negative_slope=0.1)
477
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
478
+ )
479
+ (1): Sequential(
480
+ (0): LeakyReLU(negative_slope=0.1)
481
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
482
+ )
483
+ (2): Sequential(
484
+ (0): LeakyReLU(negative_slope=0.1)
485
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
486
+ )
487
+ )
488
+ (convs2): ModuleList(
489
+ (0): Sequential(
490
+ (0): LeakyReLU(negative_slope=0.1)
491
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
492
+ )
493
+ (1): Sequential(
494
+ (0): LeakyReLU(negative_slope=0.1)
495
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
496
+ )
497
+ (2): Sequential(
498
+ (0): LeakyReLU(negative_slope=0.1)
499
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
500
+ )
501
+ )
502
+ )
503
+ (7): ResidualBlock(
504
+ (convs1): ModuleList(
505
+ (0): Sequential(
506
+ (0): LeakyReLU(negative_slope=0.1)
507
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
508
+ )
509
+ (1): Sequential(
510
+ (0): LeakyReLU(negative_slope=0.1)
511
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
512
+ )
513
+ (2): Sequential(
514
+ (0): LeakyReLU(negative_slope=0.1)
515
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
516
+ )
517
+ )
518
+ (convs2): ModuleList(
519
+ (0): Sequential(
520
+ (0): LeakyReLU(negative_slope=0.1)
521
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
522
+ )
523
+ (1): Sequential(
524
+ (0): LeakyReLU(negative_slope=0.1)
525
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
526
+ )
527
+ (2): Sequential(
528
+ (0): LeakyReLU(negative_slope=0.1)
529
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
530
+ )
531
+ )
532
+ )
533
+ (8): ResidualBlock(
534
+ (convs1): ModuleList(
535
+ (0): Sequential(
536
+ (0): LeakyReLU(negative_slope=0.1)
537
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
538
+ )
539
+ (1): Sequential(
540
+ (0): LeakyReLU(negative_slope=0.1)
541
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
542
+ )
543
+ (2): Sequential(
544
+ (0): LeakyReLU(negative_slope=0.1)
545
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
546
+ )
547
+ )
548
+ (convs2): ModuleList(
549
+ (0): Sequential(
550
+ (0): LeakyReLU(negative_slope=0.1)
551
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
552
+ )
553
+ (1): Sequential(
554
+ (0): LeakyReLU(negative_slope=0.1)
555
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
556
+ )
557
+ (2): Sequential(
558
+ (0): LeakyReLU(negative_slope=0.1)
559
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
560
+ )
561
+ )
562
+ )
563
+ (9): ResidualBlock(
564
+ (convs1): ModuleList(
565
+ (0): Sequential(
566
+ (0): LeakyReLU(negative_slope=0.1)
567
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
568
+ )
569
+ (1): Sequential(
570
+ (0): LeakyReLU(negative_slope=0.1)
571
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
572
+ )
573
+ (2): Sequential(
574
+ (0): LeakyReLU(negative_slope=0.1)
575
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
576
+ )
577
+ )
578
+ (convs2): ModuleList(
579
+ (0): Sequential(
580
+ (0): LeakyReLU(negative_slope=0.1)
581
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
582
+ )
583
+ (1): Sequential(
584
+ (0): LeakyReLU(negative_slope=0.1)
585
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
586
+ )
587
+ (2): Sequential(
588
+ (0): LeakyReLU(negative_slope=0.1)
589
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
590
+ )
591
+ )
592
+ )
593
+ (10): ResidualBlock(
594
+ (convs1): ModuleList(
595
+ (0): Sequential(
596
+ (0): LeakyReLU(negative_slope=0.1)
597
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
598
+ )
599
+ (1): Sequential(
600
+ (0): LeakyReLU(negative_slope=0.1)
601
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
602
+ )
603
+ (2): Sequential(
604
+ (0): LeakyReLU(negative_slope=0.1)
605
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
606
+ )
607
+ )
608
+ (convs2): ModuleList(
609
+ (0): Sequential(
610
+ (0): LeakyReLU(negative_slope=0.1)
611
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
612
+ )
613
+ (1): Sequential(
614
+ (0): LeakyReLU(negative_slope=0.1)
615
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
616
+ )
617
+ (2): Sequential(
618
+ (0): LeakyReLU(negative_slope=0.1)
619
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
620
+ )
621
+ )
622
+ )
623
+ (11): ResidualBlock(
624
+ (convs1): ModuleList(
625
+ (0): Sequential(
626
+ (0): LeakyReLU(negative_slope=0.1)
627
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
628
+ )
629
+ (1): Sequential(
630
+ (0): LeakyReLU(negative_slope=0.1)
631
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
632
+ )
633
+ (2): Sequential(
634
+ (0): LeakyReLU(negative_slope=0.1)
635
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
636
+ )
637
+ )
638
+ (convs2): ModuleList(
639
+ (0): Sequential(
640
+ (0): LeakyReLU(negative_slope=0.1)
641
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
642
+ )
643
+ (1): Sequential(
644
+ (0): LeakyReLU(negative_slope=0.1)
645
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
646
+ )
647
+ (2): Sequential(
648
+ (0): LeakyReLU(negative_slope=0.1)
649
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
650
+ )
651
+ )
652
+ )
653
+ )
654
+ (output_conv): Sequential(
655
+ (0): LeakyReLU(negative_slope=0.01)
656
+ (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
657
+ (2): Tanh()
658
+ )
659
+ )
660
+ )
661
+ (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
662
+ (msd): HiFiGANMultiScaleDiscriminator(
663
+ (discriminators): ModuleList(
664
+ (0): HiFiGANScaleDiscriminator(
665
+ (layers): ModuleList(
666
+ (0): Sequential(
667
+ (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
668
+ (1): LeakyReLU(negative_slope=0.1)
669
+ )
670
+ (1): Sequential(
671
+ (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
672
+ (1): LeakyReLU(negative_slope=0.1)
673
+ )
674
+ (2): Sequential(
675
+ (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
676
+ (1): LeakyReLU(negative_slope=0.1)
677
+ )
678
+ (3): Sequential(
679
+ (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
680
+ (1): LeakyReLU(negative_slope=0.1)
681
+ )
682
+ (4): Sequential(
683
+ (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
684
+ (1): LeakyReLU(negative_slope=0.1)
685
+ )
686
+ (5): Sequential(
687
+ (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
688
+ (1): LeakyReLU(negative_slope=0.1)
689
+ )
690
+ (6): Sequential(
691
+ (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
692
+ (1): LeakyReLU(negative_slope=0.1)
693
+ )
694
+ (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
695
+ )
696
+ )
697
+ )
698
+ )
699
+ (mpd): HiFiGANMultiPeriodDiscriminator(
700
+ (discriminators): ModuleList(
701
+ (0): HiFiGANPeriodDiscriminator(
702
+ (convs): ModuleList(
703
+ (0): Sequential(
704
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
705
+ (1): LeakyReLU(negative_slope=0.1)
706
+ )
707
+ (1): Sequential(
708
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
709
+ (1): LeakyReLU(negative_slope=0.1)
710
+ )
711
+ (2): Sequential(
712
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
713
+ (1): LeakyReLU(negative_slope=0.1)
714
+ )
715
+ (3): Sequential(
716
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
717
+ (1): LeakyReLU(negative_slope=0.1)
718
+ )
719
+ (4): Sequential(
720
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
721
+ (1): LeakyReLU(negative_slope=0.1)
722
+ )
723
+ )
724
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
725
+ )
726
+ (1): HiFiGANPeriodDiscriminator(
727
+ (convs): ModuleList(
728
+ (0): Sequential(
729
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
730
+ (1): LeakyReLU(negative_slope=0.1)
731
+ )
732
+ (1): Sequential(
733
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
734
+ (1): LeakyReLU(negative_slope=0.1)
735
+ )
736
+ (2): Sequential(
737
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
738
+ (1): LeakyReLU(negative_slope=0.1)
739
+ )
740
+ (3): Sequential(
741
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
742
+ (1): LeakyReLU(negative_slope=0.1)
743
+ )
744
+ (4): Sequential(
745
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
746
+ (1): LeakyReLU(negative_slope=0.1)
747
+ )
748
+ )
749
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
750
+ )
751
+ (2): HiFiGANPeriodDiscriminator(
752
+ (convs): ModuleList(
753
+ (0): Sequential(
754
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
755
+ (1): LeakyReLU(negative_slope=0.1)
756
+ )
757
+ (1): Sequential(
758
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
759
+ (1): LeakyReLU(negative_slope=0.1)
760
+ )
761
+ (2): Sequential(
762
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
763
+ (1): LeakyReLU(negative_slope=0.1)
764
+ )
765
+ (3): Sequential(
766
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
767
+ (1): LeakyReLU(negative_slope=0.1)
768
+ )
769
+ (4): Sequential(
770
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
771
+ (1): LeakyReLU(negative_slope=0.1)
772
+ )
773
+ )
774
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
775
+ )
776
+ (3): HiFiGANPeriodDiscriminator(
777
+ (convs): ModuleList(
778
+ (0): Sequential(
779
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
780
+ (1): LeakyReLU(negative_slope=0.1)
781
+ )
782
+ (1): Sequential(
783
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
784
+ (1): LeakyReLU(negative_slope=0.1)
785
+ )
786
+ (2): Sequential(
787
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
788
+ (1): LeakyReLU(negative_slope=0.1)
789
+ )
790
+ (3): Sequential(
791
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
792
+ (1): LeakyReLU(negative_slope=0.1)
793
+ )
794
+ (4): Sequential(
795
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
796
+ (1): LeakyReLU(negative_slope=0.1)
797
+ )
798
+ )
799
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
800
+ )
801
+ (4): HiFiGANPeriodDiscriminator(
802
+ (convs): ModuleList(
803
+ (0): Sequential(
804
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
805
+ (1): LeakyReLU(negative_slope=0.1)
806
+ )
807
+ (1): Sequential(
808
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
809
+ (1): LeakyReLU(negative_slope=0.1)
810
+ )
811
+ (2): Sequential(
812
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
813
+ (1): LeakyReLU(negative_slope=0.1)
814
+ )
815
+ (3): Sequential(
816
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
817
+ (1): LeakyReLU(negative_slope=0.1)
818
+ )
819
+ (4): Sequential(
820
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
821
+ (1): LeakyReLU(negative_slope=0.1)
822
+ )
823
+ )
824
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
825
+ )
826
+ )
827
+ )
828
+ )
829
+ (generator_adv_loss): GeneratorAdversarialLoss()
830
+ (discriminator_adv_loss): DiscriminatorAdversarialLoss()
831
+ (feat_match_loss): FeatureMatchLoss()
832
+ (mel_loss): MelSpectrogramLoss(
833
+ (wav_to_mel): LogMelFbank(
834
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
835
+ (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False)
836
+ )
837
+ )
838
+ (var_loss): VarianceLoss(
839
+ (mse_criterion): MSELoss()
840
+ (duration_criterion): DurationPredictorLoss(
841
+ (criterion): MSELoss()
842
+ )
843
+ )
844
+ (forwardsum_loss): ForwardSumLoss()
845
+ )
846
+ )
847
+
848
+ Model summary:
849
+ Class Name: ESPnetGANTTSModel
850
+ Total Number of model parameters: 83.28 M
851
+ Number of trainable parameters: 83.28 M (100.0%)
852
+ Size: 333.11 MB
853
+ Type: torch.float32
854
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,870 (abs_task:1161) INFO: Optimizer:
855
+ AdamW (
856
+ Parameter Group 0
857
+ amsgrad: False
858
+ betas: [0.8, 0.99]
859
+ eps: 1e-09
860
+ initial_lr: 0.0002
861
+ lr: 0.0002
862
+ weight_decay: 0.0
863
+ )
864
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,870 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f3cb00fb550>
865
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,870 (abs_task:1161) INFO: Optimizer2:
866
+ AdamW (
867
+ Parameter Group 0
868
+ amsgrad: False
869
+ betas: [0.8, 0.99]
870
+ eps: 1e-09
871
+ initial_lr: 0.0002
872
+ lr: 0.0002
873
+ weight_decay: 0.0
874
+ )
875
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,870 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f3dab0d89d0>
876
+ [92b100c97f43:0/4] 2025-03-04 21:30:47,871 (abs_task:1171) INFO: Saving the configuration in exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml
877
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,205 (abs_task:1525) INFO: [train] dataset:
878
+ ESPnetDataset(
879
+ text: {"path": "dump/raw/jvs010_tr_no_dev/text", "type": "text"}
880
+ speech: {"path": "dump/raw/jvs010_tr_no_dev/wav.scp", "type": "sound"}
881
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f3cb00fbdc0>)
882
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,205 (abs_task:1526) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=3, batch_bins=9000000, sort_in_batch=descending, sort_batch=descending)
883
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,205 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=3, mean=33.3, min=6, max=53
884
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,225 (abs_task:1525) INFO: [valid] dataset:
885
+ ESPnetDataset(
886
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
887
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
888
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f3cb00fb520>)
889
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,225 (abs_task:1526) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=1, batch_bins=9000000, sort_in_batch=descending, sort_batch=descending)
890
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,225 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=1, mean=15.0, min=15, max=15
891
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,244 (abs_task:1525) INFO: [plot_att] dataset:
892
+ ESPnetDataset(
893
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
894
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
895
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f3cb00b7130>)
896
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,244 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15, batch_size=1, key_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn,
897
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,244 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
898
+ 92b100c97f43:1179446:1179446 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
899
+ 92b100c97f43:1179446:1179446 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
900
+
901
+ 92b100c97f43:1179446:1179446 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
902
+ 92b100c97f43:1179446:1179446 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
903
+ 92b100c97f43:1179446:1179446 [0] NCCL INFO Using network Socket
904
+ NCCL version 2.10.3+cuda11.3
905
+ 92b100c97f43:1179447:1179447 [1] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
906
+ 92b100c97f43:1179449:1179449 [3] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
907
+ 92b100c97f43:1179447:1179447 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
908
+ 92b100c97f43:1179449:1179449 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
909
+
910
+ 92b100c97f43:1179447:1179447 [1] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
911
+
912
+ 92b100c97f43:1179449:1179449 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
913
+ 92b100c97f43:1179447:1179447 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
914
+ 92b100c97f43:1179449:1179449 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
915
+ 92b100c97f43:1179447:1179447 [1] NCCL INFO Using network Socket
916
+ 92b100c97f43:1179449:1179449 [3] NCCL INFO Using network Socket
917
+ 92b100c97f43:1179448:1179448 [2] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
918
+ 92b100c97f43:1179448:1179448 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
919
+
920
+ 92b100c97f43:1179448:1179448 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
921
+ 92b100c97f43:1179448:1179448 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
922
+ 92b100c97f43:1179448:1179448 [2] NCCL INFO Using network Socket
923
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Channel 00/02 : 0 1 2 3
924
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
925
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
926
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
927
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Channel 01/02 : 0 1 2 3
928
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff
929
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff
930
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
931
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff
932
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff
933
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Channel 00 : 3[60] -> 0[30] via direct shared memory
934
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Channel 00 : 2[50] -> 3[60] via direct shared memory
935
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Channel 01 : 3[60] -> 0[30] via direct shared memory
936
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Channel 01 : 2[50] -> 3[60] via direct shared memory
937
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Channel 00 : 1[40] -> 2[50] via direct shared memory
938
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Channel 00 : 0[30] -> 1[40] via direct shared memory
939
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Channel 01 : 0[30] -> 1[40] via direct shared memory
940
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Channel 01 : 1[40] -> 2[50] via direct shared memory
941
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Connected all rings
942
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Connected all rings
943
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Connected all rings
944
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Channel 00 : 2[50] -> 1[40] via direct shared memory
945
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Channel 01 : 2[50] -> 1[40] via direct shared memory
946
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Connected all rings
947
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Channel 00 : 3[60] -> 2[50] via direct shared memory
948
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Channel 01 : 3[60] -> 2[50] via direct shared memory
949
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO Connected all trees
950
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
951
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
952
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Channel 00 : 1[40] -> 0[30] via direct shared memory
953
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Channel 01 : 1[40] -> 0[30] via direct shared memory
954
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO Connected all trees
955
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
956
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
957
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO Connected all trees
958
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO Connected all trees
959
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
960
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
961
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
962
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
963
+ 92b100c97f43:1179449:1179486 [3] NCCL INFO comm 0x7f41ac0030d0 rank 3 nranks 4 cudaDev 3 busId 60 - Init COMPLETE
964
+ 92b100c97f43:1179448:1179487 [2] NCCL INFO comm 0x7fc9b80030d0 rank 2 nranks 4 cudaDev 2 busId 50 - Init COMPLETE
965
+ 92b100c97f43:1179447:1179485 [1] NCCL INFO comm 0x7fe4540030d0 rank 1 nranks 4 cudaDev 1 busId 40 - Init COMPLETE
966
+ 92b100c97f43:1179446:1179484 [0] NCCL INFO comm 0x7f3bc40030d0 rank 0 nranks 4 cudaDev 0 busId 30 - Init COMPLETE
967
+ 92b100c97f43:1179446:1179446 [0] NCCL INFO Launch mode Parallel
968
+ [92b100c97f43:0/4] 2025-03-04 21:30:48,689 (trainer:280) INFO: 1/130epoch started
969
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
970
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
971
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
972
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
973
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
974
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
975
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
976
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
977
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
978
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
979
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
980
+ olens = (ilens - self.n_fft) // self.hop_length + 1
981
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
982
+ olens = (ilens - self.n_fft) // self.hop_length + 1
983
+ [92b100c97f43:0/4] 2025-03-04 21:32:50,772 (gan_trainer:305) INFO: 1epoch:train:1-50batch: iter_time=0.045, generator_forward_time=0.898, generator_loss=140.208, generator_g_loss=110.977, generator_var_loss=5.358, generator_align_loss=23.874, generator_g_mel_loss=107.112, generator_g_adv_loss=2.215, generator_g_feat_match_loss=1.649, generator_var_dur_loss=0.605, generator_var_pitch_loss=2.439, generator_var_energy_loss=2.314, generator_align_forwardsum_loss=10.578, generator_align_bin_loss=1.359, generator_backward_time=0.317, generator_optim_step_time=0.033, optim0_lr0=2.000e-04, generator_train_time=1.350, discriminator_forward_time=0.690, discriminator_loss=2.784, discriminator_real_loss=1.570, discriminator_fake_loss=1.214, discriminator_backward_time=0.228, discriminator_optim_step_time=0.008, optim1_lr0=2.000e-04, discriminator_train_time=0.965, train_time=2.439
984
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
985
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
986
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
987
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
988
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
989
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
990
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
991
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
992
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
993
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
994
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
995
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
996
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
997
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
998
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
999
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1000
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1001
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1002
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1003
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1004
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1005
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1006
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1007
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1008
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1009
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1010
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1011
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1012
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1013
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1014
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1015
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1016
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1017
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1018
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1019
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1020
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1021
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1022
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1023
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1024
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1025
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1026
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1027
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1028
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1029
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1030
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1031
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1032
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1033
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1034
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1035
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1036
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1037
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1038
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1039
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1040
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1041
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1042
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1043
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1044
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1045
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1046
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1047
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1048
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1049
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1050
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1051
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1052
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1053
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1054
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1055
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1056
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1057
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1058
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1059
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1060
+ Traceback (most recent call last):
1061
+ File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
1062
+ return _run_code(code, main_globals, None,
1063
+ File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
1064
+ exec(code, run_globals)
1065
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 22, in <module>
1066
+ main()
1067
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 18, in main
1068
+ GANTTSTask.main(cmd=cmd)
1069
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1069, in main
1070
+ while not ProcessContext(processes, error_queues).join():
1071
+ File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 99, in join
1072
+ ready = multiprocessing.connection.wait(
1073
+ File "/usr/lib/python3.8/multiprocessing/connection.py", line 931, in wait
1074
+ ready = selector.select(timeout)
1075
+ File "/usr/lib/python3.8/selectors.py", line 415, in select
1076
+ fd_event_list = self._selector.poll(timeout)
1077
+ KeyboardInterrupt
1078
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1079
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1080
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1081
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1082
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1083
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1084
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1085
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1086
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1087
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1088
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1089
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1090
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1091
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1092
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1093
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1094
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1095
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1096
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1097
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1098
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1099
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1100
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1101
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1102
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1103
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1104
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1105
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1106
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1107
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1108
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1109
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1110
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1111
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1112
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1113
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1114
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1115
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1116
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1117
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1118
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1119
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1120
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1121
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1122
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1123
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1124
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1125
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1126
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1127
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1128
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1129
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1130
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1131
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1132
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1133
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1134
+ Process SpawnProcess-2:
1135
+ Traceback (most recent call last):
1136
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1137
+ self.run()
1138
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1139
+ self._target(*self._args, **self._kwargs)
1140
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1141
+ cls.trainer.run(
1142
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1143
+ all_steps_are_invalid = cls.train_one_epoch(
1144
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1145
+ stats, weight = recursive_average(stats, weight, distributed)
1146
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average
1147
+ obj = recursive_sum(obj, weight, distributed)
1148
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum
1149
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1150
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in <dictcomp>
1151
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1152
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum
1153
+ torch.distributed.all_reduce(obj, op=ReduceOp.SUM)
1154
+ File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1292, in all_reduce
1155
+ work.wait()
1156
+ KeyboardInterrupt
1157
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1158
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1159
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1160
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1161
+ Process SpawnProcess-4:
1162
+ Traceback (most recent call last):
1163
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1164
+ self.run()
1165
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1166
+ self._target(*self._args, **self._kwargs)
1167
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1168
+ cls.trainer.run(
1169
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1170
+ all_steps_are_invalid = cls.train_one_epoch(
1171
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1172
+ stats, weight = recursive_average(stats, weight, distributed)
1173
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average
1174
+ obj = recursive_sum(obj, weight, distributed)
1175
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum
1176
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1177
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in <dictcomp>
1178
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1179
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum
1180
+ torch.distributed.all_reduce(obj, op=ReduceOp.SUM)
1181
+ File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1292, in all_reduce
1182
+ work.wait()
1183
+ KeyboardInterrupt
1184
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1185
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1186
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1187
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1188
+ Process SpawnProcess-3:
1189
+ Traceback (most recent call last):
1190
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1191
+ stats, weight = recursive_average(stats, weight, distributed)
1192
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average
1193
+ obj = recursive_sum(obj, weight, distributed)
1194
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum
1195
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1196
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in <dictcomp>
1197
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1198
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum
1199
+ torch.distributed.all_reduce(obj, op=ReduceOp.SUM)
1200
+ File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1292, in all_reduce
1201
+ work.wait()
1202
+ RuntimeError: [Rank 2] Caught collective operation timeout: WorkNCCL(OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800396 milliseconds before timing out.
1203
+
1204
+ During handling of the above exception, another exception occurred:
1205
+
1206
+ Traceback (most recent call last):
1207
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1208
+ self.run()
1209
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1210
+ self._target(*self._args, **self._kwargs)
1211
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1212
+ cls.trainer.run(
1213
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1214
+ all_steps_are_invalid = cls.train_one_epoch(
1215
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1216
+ stats, weight = recursive_average(stats, weight, distributed)
1217
+ KeyboardInterrupt
1218
+ Process SpawnProcess-1:
1219
+ Traceback (most recent call last):
1220
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1221
+ stats, weight = recursive_average(stats, weight, distributed)
1222
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average
1223
+ obj = recursive_sum(obj, weight, distributed)
1224
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum
1225
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1226
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 13, in <dictcomp>
1227
+ return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()}
1228
+ File "/work/espnet/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum
1229
+ torch.distributed.all_reduce(obj, op=ReduceOp.SUM)
1230
+ File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1292, in all_reduce
1231
+ work.wait()
1232
+ RuntimeError: [Rank 0] Caught collective operation timeout: WorkNCCL(OpType=ALLREDUCE, TensorShape=[], Timeout(ms)=1800000) ran for 1800456 milliseconds before timing out.
1233
+
1234
+ During handling of the above exception, another exception occurred:
1235
+
1236
+ Traceback (most recent call last):
1237
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1238
+ self.run()
1239
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1240
+ self._target(*self._args, **self._kwargs)
1241
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1242
+ cls.trainer.run(
1243
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1244
+ all_steps_are_invalid = cls.train_one_epoch(
1245
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 202, in train_one_epoch
1246
+ stats, weight = recursive_average(stats, weight, distributed)
1247
+ KeyboardInterrupt
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.4.log ADDED
@@ -0,0 +1,1212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
2
+ # Started at Tue Mar 4 21:23:38 JST 2025
3
+ #
4
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
+ /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
7
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
8
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
9
+ [92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
10
+ [92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
11
+ [92b100c97f43:0/4] 2025-03-04 21:23:44,896 (gan_tts:304) INFO: Vocabulary size: 41
12
+ [92b100c97f43:0/4] 2025-03-04 21:23:45,027 (encoder:172) INFO: encoder self-attention layer type = self-attention
13
+ [92b100c97f43:0/4] 2025-03-04 21:23:45,249 (encoder:172) INFO: encoder self-attention layer type = self-attention
14
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,351 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
15
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1158) INFO: Model structure:
16
+ ESPnetGANTTSModel(
17
+ (feats_extract): LogMelFbank(
18
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
19
+ (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False)
20
+ )
21
+ (normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz, norm_means=True, norm_vars=True)
22
+ (pitch_extract): Dio()
23
+ (pitch_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz, norm_means=True, norm_vars=True)
24
+ (energy_extract): Energy(
25
+ (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
26
+ )
27
+ (energy_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz, norm_means=True, norm_vars=True)
28
+ (tts): JETS(
29
+ (generator): JETSGenerator(
30
+ (encoder): Encoder(
31
+ (embed): Sequential(
32
+ (0): Embedding(41, 256, padding_idx=0)
33
+ (1): ScaledPositionalEncoding(
34
+ (dropout): Dropout(p=0.2, inplace=False)
35
+ )
36
+ )
37
+ (encoders): MultiSequential(
38
+ (0): EncoderLayer(
39
+ (self_attn): MultiHeadedAttention(
40
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
41
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
42
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
43
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
44
+ (dropout): Dropout(p=0.2, inplace=False)
45
+ )
46
+ (feed_forward): MultiLayeredConv1d(
47
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
48
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
49
+ (dropout): Dropout(p=0.2, inplace=False)
50
+ )
51
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
52
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
53
+ (dropout): Dropout(p=0.2, inplace=False)
54
+ )
55
+ (1): EncoderLayer(
56
+ (self_attn): MultiHeadedAttention(
57
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
58
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
59
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
60
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
61
+ (dropout): Dropout(p=0.2, inplace=False)
62
+ )
63
+ (feed_forward): MultiLayeredConv1d(
64
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
65
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
66
+ (dropout): Dropout(p=0.2, inplace=False)
67
+ )
68
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
69
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
70
+ (dropout): Dropout(p=0.2, inplace=False)
71
+ )
72
+ (2): EncoderLayer(
73
+ (self_attn): MultiHeadedAttention(
74
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
75
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
76
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
77
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
78
+ (dropout): Dropout(p=0.2, inplace=False)
79
+ )
80
+ (feed_forward): MultiLayeredConv1d(
81
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
82
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
83
+ (dropout): Dropout(p=0.2, inplace=False)
84
+ )
85
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
86
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
87
+ (dropout): Dropout(p=0.2, inplace=False)
88
+ )
89
+ (3): EncoderLayer(
90
+ (self_attn): MultiHeadedAttention(
91
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
92
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
93
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
94
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
95
+ (dropout): Dropout(p=0.2, inplace=False)
96
+ )
97
+ (feed_forward): MultiLayeredConv1d(
98
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
99
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
100
+ (dropout): Dropout(p=0.2, inplace=False)
101
+ )
102
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
103
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
104
+ (dropout): Dropout(p=0.2, inplace=False)
105
+ )
106
+ )
107
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
108
+ )
109
+ (duration_predictor): DurationPredictor(
110
+ (conv): ModuleList(
111
+ (0): Sequential(
112
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
113
+ (1): ReLU()
114
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
115
+ (3): Dropout(p=0.1, inplace=False)
116
+ )
117
+ (1): Sequential(
118
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
119
+ (1): ReLU()
120
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
121
+ (3): Dropout(p=0.1, inplace=False)
122
+ )
123
+ )
124
+ (linear): Linear(in_features=256, out_features=1, bias=True)
125
+ )
126
+ (pitch_predictor): VariancePredictor(
127
+ (conv): ModuleList(
128
+ (0): Sequential(
129
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
130
+ (1): ReLU()
131
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
132
+ (3): Dropout(p=0.5, inplace=False)
133
+ )
134
+ (1): Sequential(
135
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
136
+ (1): ReLU()
137
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
138
+ (3): Dropout(p=0.5, inplace=False)
139
+ )
140
+ (2): Sequential(
141
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
142
+ (1): ReLU()
143
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
144
+ (3): Dropout(p=0.5, inplace=False)
145
+ )
146
+ (3): Sequential(
147
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
148
+ (1): ReLU()
149
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
150
+ (3): Dropout(p=0.5, inplace=False)
151
+ )
152
+ (4): Sequential(
153
+ (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
154
+ (1): ReLU()
155
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
156
+ (3): Dropout(p=0.5, inplace=False)
157
+ )
158
+ )
159
+ (linear): Linear(in_features=256, out_features=1, bias=True)
160
+ )
161
+ (pitch_embed): Sequential(
162
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
163
+ (1): Dropout(p=0.0, inplace=False)
164
+ )
165
+ (energy_predictor): VariancePredictor(
166
+ (conv): ModuleList(
167
+ (0): Sequential(
168
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
169
+ (1): ReLU()
170
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
171
+ (3): Dropout(p=0.5, inplace=False)
172
+ )
173
+ (1): Sequential(
174
+ (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
175
+ (1): ReLU()
176
+ (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
177
+ (3): Dropout(p=0.5, inplace=False)
178
+ )
179
+ )
180
+ (linear): Linear(in_features=256, out_features=1, bias=True)
181
+ )
182
+ (energy_embed): Sequential(
183
+ (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
184
+ (1): Dropout(p=0.0, inplace=False)
185
+ )
186
+ (alignment_module): AlignmentModule(
187
+ (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
188
+ (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
189
+ (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,))
190
+ (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
191
+ (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
192
+ )
193
+ (length_regulator): GaussianUpsampling()
194
+ (decoder): Encoder(
195
+ (embed): Sequential(
196
+ (0): ScaledPositionalEncoding(
197
+ (dropout): Dropout(p=0.2, inplace=False)
198
+ )
199
+ )
200
+ (encoders): MultiSequential(
201
+ (0): EncoderLayer(
202
+ (self_attn): MultiHeadedAttention(
203
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
204
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
205
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
206
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
207
+ (dropout): Dropout(p=0.2, inplace=False)
208
+ )
209
+ (feed_forward): MultiLayeredConv1d(
210
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
211
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
212
+ (dropout): Dropout(p=0.2, inplace=False)
213
+ )
214
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
215
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
216
+ (dropout): Dropout(p=0.2, inplace=False)
217
+ )
218
+ (1): EncoderLayer(
219
+ (self_attn): MultiHeadedAttention(
220
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
221
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
222
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
223
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
224
+ (dropout): Dropout(p=0.2, inplace=False)
225
+ )
226
+ (feed_forward): MultiLayeredConv1d(
227
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
228
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
229
+ (dropout): Dropout(p=0.2, inplace=False)
230
+ )
231
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
232
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
233
+ (dropout): Dropout(p=0.2, inplace=False)
234
+ )
235
+ (2): EncoderLayer(
236
+ (self_attn): MultiHeadedAttention(
237
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
238
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
239
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
240
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
241
+ (dropout): Dropout(p=0.2, inplace=False)
242
+ )
243
+ (feed_forward): MultiLayeredConv1d(
244
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
245
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
246
+ (dropout): Dropout(p=0.2, inplace=False)
247
+ )
248
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
249
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
250
+ (dropout): Dropout(p=0.2, inplace=False)
251
+ )
252
+ (3): EncoderLayer(
253
+ (self_attn): MultiHeadedAttention(
254
+ (linear_q): Linear(in_features=256, out_features=256, bias=True)
255
+ (linear_k): Linear(in_features=256, out_features=256, bias=True)
256
+ (linear_v): Linear(in_features=256, out_features=256, bias=True)
257
+ (linear_out): Linear(in_features=256, out_features=256, bias=True)
258
+ (dropout): Dropout(p=0.2, inplace=False)
259
+ )
260
+ (feed_forward): MultiLayeredConv1d(
261
+ (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
262
+ (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
263
+ (dropout): Dropout(p=0.2, inplace=False)
264
+ )
265
+ (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
266
+ (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
267
+ (dropout): Dropout(p=0.2, inplace=False)
268
+ )
269
+ )
270
+ (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
271
+ )
272
+ (generator): HiFiGANGenerator(
273
+ (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,))
274
+ (upsamples): ModuleList(
275
+ (0): Sequential(
276
+ (0): LeakyReLU(negative_slope=0.1)
277
+ (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
278
+ )
279
+ (1): Sequential(
280
+ (0): LeakyReLU(negative_slope=0.1)
281
+ (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
282
+ )
283
+ (2): Sequential(
284
+ (0): LeakyReLU(negative_slope=0.1)
285
+ (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
286
+ )
287
+ (3): Sequential(
288
+ (0): LeakyReLU(negative_slope=0.1)
289
+ (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
290
+ )
291
+ )
292
+ (blocks): ModuleList(
293
+ (0): ResidualBlock(
294
+ (convs1): ModuleList(
295
+ (0): Sequential(
296
+ (0): LeakyReLU(negative_slope=0.1)
297
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
298
+ )
299
+ (1): Sequential(
300
+ (0): LeakyReLU(negative_slope=0.1)
301
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
302
+ )
303
+ (2): Sequential(
304
+ (0): LeakyReLU(negative_slope=0.1)
305
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
306
+ )
307
+ )
308
+ (convs2): ModuleList(
309
+ (0): Sequential(
310
+ (0): LeakyReLU(negative_slope=0.1)
311
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
312
+ )
313
+ (1): Sequential(
314
+ (0): LeakyReLU(negative_slope=0.1)
315
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
316
+ )
317
+ (2): Sequential(
318
+ (0): LeakyReLU(negative_slope=0.1)
319
+ (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
320
+ )
321
+ )
322
+ )
323
+ (1): ResidualBlock(
324
+ (convs1): ModuleList(
325
+ (0): Sequential(
326
+ (0): LeakyReLU(negative_slope=0.1)
327
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
328
+ )
329
+ (1): Sequential(
330
+ (0): LeakyReLU(negative_slope=0.1)
331
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
332
+ )
333
+ (2): Sequential(
334
+ (0): LeakyReLU(negative_slope=0.1)
335
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
336
+ )
337
+ )
338
+ (convs2): ModuleList(
339
+ (0): Sequential(
340
+ (0): LeakyReLU(negative_slope=0.1)
341
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
342
+ )
343
+ (1): Sequential(
344
+ (0): LeakyReLU(negative_slope=0.1)
345
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
346
+ )
347
+ (2): Sequential(
348
+ (0): LeakyReLU(negative_slope=0.1)
349
+ (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
350
+ )
351
+ )
352
+ )
353
+ (2): ResidualBlock(
354
+ (convs1): ModuleList(
355
+ (0): Sequential(
356
+ (0): LeakyReLU(negative_slope=0.1)
357
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
358
+ )
359
+ (1): Sequential(
360
+ (0): LeakyReLU(negative_slope=0.1)
361
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
362
+ )
363
+ (2): Sequential(
364
+ (0): LeakyReLU(negative_slope=0.1)
365
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
366
+ )
367
+ )
368
+ (convs2): ModuleList(
369
+ (0): Sequential(
370
+ (0): LeakyReLU(negative_slope=0.1)
371
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
372
+ )
373
+ (1): Sequential(
374
+ (0): LeakyReLU(negative_slope=0.1)
375
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
376
+ )
377
+ (2): Sequential(
378
+ (0): LeakyReLU(negative_slope=0.1)
379
+ (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
380
+ )
381
+ )
382
+ )
383
+ (3): ResidualBlock(
384
+ (convs1): ModuleList(
385
+ (0): Sequential(
386
+ (0): LeakyReLU(negative_slope=0.1)
387
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
388
+ )
389
+ (1): Sequential(
390
+ (0): LeakyReLU(negative_slope=0.1)
391
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
392
+ )
393
+ (2): Sequential(
394
+ (0): LeakyReLU(negative_slope=0.1)
395
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
396
+ )
397
+ )
398
+ (convs2): ModuleList(
399
+ (0): Sequential(
400
+ (0): LeakyReLU(negative_slope=0.1)
401
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
402
+ )
403
+ (1): Sequential(
404
+ (0): LeakyReLU(negative_slope=0.1)
405
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
406
+ )
407
+ (2): Sequential(
408
+ (0): LeakyReLU(negative_slope=0.1)
409
+ (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
410
+ )
411
+ )
412
+ )
413
+ (4): ResidualBlock(
414
+ (convs1): ModuleList(
415
+ (0): Sequential(
416
+ (0): LeakyReLU(negative_slope=0.1)
417
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
418
+ )
419
+ (1): Sequential(
420
+ (0): LeakyReLU(negative_slope=0.1)
421
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
422
+ )
423
+ (2): Sequential(
424
+ (0): LeakyReLU(negative_slope=0.1)
425
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
426
+ )
427
+ )
428
+ (convs2): ModuleList(
429
+ (0): Sequential(
430
+ (0): LeakyReLU(negative_slope=0.1)
431
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
432
+ )
433
+ (1): Sequential(
434
+ (0): LeakyReLU(negative_slope=0.1)
435
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
436
+ )
437
+ (2): Sequential(
438
+ (0): LeakyReLU(negative_slope=0.1)
439
+ (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
440
+ )
441
+ )
442
+ )
443
+ (5): ResidualBlock(
444
+ (convs1): ModuleList(
445
+ (0): Sequential(
446
+ (0): LeakyReLU(negative_slope=0.1)
447
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
448
+ )
449
+ (1): Sequential(
450
+ (0): LeakyReLU(negative_slope=0.1)
451
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
452
+ )
453
+ (2): Sequential(
454
+ (0): LeakyReLU(negative_slope=0.1)
455
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
456
+ )
457
+ )
458
+ (convs2): ModuleList(
459
+ (0): Sequential(
460
+ (0): LeakyReLU(negative_slope=0.1)
461
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
462
+ )
463
+ (1): Sequential(
464
+ (0): LeakyReLU(negative_slope=0.1)
465
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
466
+ )
467
+ (2): Sequential(
468
+ (0): LeakyReLU(negative_slope=0.1)
469
+ (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
470
+ )
471
+ )
472
+ )
473
+ (6): ResidualBlock(
474
+ (convs1): ModuleList(
475
+ (0): Sequential(
476
+ (0): LeakyReLU(negative_slope=0.1)
477
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
478
+ )
479
+ (1): Sequential(
480
+ (0): LeakyReLU(negative_slope=0.1)
481
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
482
+ )
483
+ (2): Sequential(
484
+ (0): LeakyReLU(negative_slope=0.1)
485
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
486
+ )
487
+ )
488
+ (convs2): ModuleList(
489
+ (0): Sequential(
490
+ (0): LeakyReLU(negative_slope=0.1)
491
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
492
+ )
493
+ (1): Sequential(
494
+ (0): LeakyReLU(negative_slope=0.1)
495
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
496
+ )
497
+ (2): Sequential(
498
+ (0): LeakyReLU(negative_slope=0.1)
499
+ (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
500
+ )
501
+ )
502
+ )
503
+ (7): ResidualBlock(
504
+ (convs1): ModuleList(
505
+ (0): Sequential(
506
+ (0): LeakyReLU(negative_slope=0.1)
507
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
508
+ )
509
+ (1): Sequential(
510
+ (0): LeakyReLU(negative_slope=0.1)
511
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
512
+ )
513
+ (2): Sequential(
514
+ (0): LeakyReLU(negative_slope=0.1)
515
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
516
+ )
517
+ )
518
+ (convs2): ModuleList(
519
+ (0): Sequential(
520
+ (0): LeakyReLU(negative_slope=0.1)
521
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
522
+ )
523
+ (1): Sequential(
524
+ (0): LeakyReLU(negative_slope=0.1)
525
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
526
+ )
527
+ (2): Sequential(
528
+ (0): LeakyReLU(negative_slope=0.1)
529
+ (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
530
+ )
531
+ )
532
+ )
533
+ (8): ResidualBlock(
534
+ (convs1): ModuleList(
535
+ (0): Sequential(
536
+ (0): LeakyReLU(negative_slope=0.1)
537
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
538
+ )
539
+ (1): Sequential(
540
+ (0): LeakyReLU(negative_slope=0.1)
541
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
542
+ )
543
+ (2): Sequential(
544
+ (0): LeakyReLU(negative_slope=0.1)
545
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
546
+ )
547
+ )
548
+ (convs2): ModuleList(
549
+ (0): Sequential(
550
+ (0): LeakyReLU(negative_slope=0.1)
551
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
552
+ )
553
+ (1): Sequential(
554
+ (0): LeakyReLU(negative_slope=0.1)
555
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
556
+ )
557
+ (2): Sequential(
558
+ (0): LeakyReLU(negative_slope=0.1)
559
+ (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
560
+ )
561
+ )
562
+ )
563
+ (9): ResidualBlock(
564
+ (convs1): ModuleList(
565
+ (0): Sequential(
566
+ (0): LeakyReLU(negative_slope=0.1)
567
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
568
+ )
569
+ (1): Sequential(
570
+ (0): LeakyReLU(negative_slope=0.1)
571
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
572
+ )
573
+ (2): Sequential(
574
+ (0): LeakyReLU(negative_slope=0.1)
575
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
576
+ )
577
+ )
578
+ (convs2): ModuleList(
579
+ (0): Sequential(
580
+ (0): LeakyReLU(negative_slope=0.1)
581
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
582
+ )
583
+ (1): Sequential(
584
+ (0): LeakyReLU(negative_slope=0.1)
585
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
586
+ )
587
+ (2): Sequential(
588
+ (0): LeakyReLU(negative_slope=0.1)
589
+ (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
590
+ )
591
+ )
592
+ )
593
+ (10): ResidualBlock(
594
+ (convs1): ModuleList(
595
+ (0): Sequential(
596
+ (0): LeakyReLU(negative_slope=0.1)
597
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
598
+ )
599
+ (1): Sequential(
600
+ (0): LeakyReLU(negative_slope=0.1)
601
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
602
+ )
603
+ (2): Sequential(
604
+ (0): LeakyReLU(negative_slope=0.1)
605
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
606
+ )
607
+ )
608
+ (convs2): ModuleList(
609
+ (0): Sequential(
610
+ (0): LeakyReLU(negative_slope=0.1)
611
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
612
+ )
613
+ (1): Sequential(
614
+ (0): LeakyReLU(negative_slope=0.1)
615
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
616
+ )
617
+ (2): Sequential(
618
+ (0): LeakyReLU(negative_slope=0.1)
619
+ (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
620
+ )
621
+ )
622
+ )
623
+ (11): ResidualBlock(
624
+ (convs1): ModuleList(
625
+ (0): Sequential(
626
+ (0): LeakyReLU(negative_slope=0.1)
627
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
628
+ )
629
+ (1): Sequential(
630
+ (0): LeakyReLU(negative_slope=0.1)
631
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
632
+ )
633
+ (2): Sequential(
634
+ (0): LeakyReLU(negative_slope=0.1)
635
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
636
+ )
637
+ )
638
+ (convs2): ModuleList(
639
+ (0): Sequential(
640
+ (0): LeakyReLU(negative_slope=0.1)
641
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
642
+ )
643
+ (1): Sequential(
644
+ (0): LeakyReLU(negative_slope=0.1)
645
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
646
+ )
647
+ (2): Sequential(
648
+ (0): LeakyReLU(negative_slope=0.1)
649
+ (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
650
+ )
651
+ )
652
+ )
653
+ )
654
+ (output_conv): Sequential(
655
+ (0): LeakyReLU(negative_slope=0.01)
656
+ (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
657
+ (2): Tanh()
658
+ )
659
+ )
660
+ )
661
+ (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
662
+ (msd): HiFiGANMultiScaleDiscriminator(
663
+ (discriminators): ModuleList(
664
+ (0): HiFiGANScaleDiscriminator(
665
+ (layers): ModuleList(
666
+ (0): Sequential(
667
+ (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
668
+ (1): LeakyReLU(negative_slope=0.1)
669
+ )
670
+ (1): Sequential(
671
+ (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
672
+ (1): LeakyReLU(negative_slope=0.1)
673
+ )
674
+ (2): Sequential(
675
+ (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
676
+ (1): LeakyReLU(negative_slope=0.1)
677
+ )
678
+ (3): Sequential(
679
+ (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
680
+ (1): LeakyReLU(negative_slope=0.1)
681
+ )
682
+ (4): Sequential(
683
+ (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
684
+ (1): LeakyReLU(negative_slope=0.1)
685
+ )
686
+ (5): Sequential(
687
+ (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
688
+ (1): LeakyReLU(negative_slope=0.1)
689
+ )
690
+ (6): Sequential(
691
+ (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
692
+ (1): LeakyReLU(negative_slope=0.1)
693
+ )
694
+ (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
695
+ )
696
+ )
697
+ )
698
+ )
699
+ (mpd): HiFiGANMultiPeriodDiscriminator(
700
+ (discriminators): ModuleList(
701
+ (0): HiFiGANPeriodDiscriminator(
702
+ (convs): ModuleList(
703
+ (0): Sequential(
704
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
705
+ (1): LeakyReLU(negative_slope=0.1)
706
+ )
707
+ (1): Sequential(
708
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
709
+ (1): LeakyReLU(negative_slope=0.1)
710
+ )
711
+ (2): Sequential(
712
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
713
+ (1): LeakyReLU(negative_slope=0.1)
714
+ )
715
+ (3): Sequential(
716
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
717
+ (1): LeakyReLU(negative_slope=0.1)
718
+ )
719
+ (4): Sequential(
720
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
721
+ (1): LeakyReLU(negative_slope=0.1)
722
+ )
723
+ )
724
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
725
+ )
726
+ (1): HiFiGANPeriodDiscriminator(
727
+ (convs): ModuleList(
728
+ (0): Sequential(
729
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
730
+ (1): LeakyReLU(negative_slope=0.1)
731
+ )
732
+ (1): Sequential(
733
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
734
+ (1): LeakyReLU(negative_slope=0.1)
735
+ )
736
+ (2): Sequential(
737
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
738
+ (1): LeakyReLU(negative_slope=0.1)
739
+ )
740
+ (3): Sequential(
741
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
742
+ (1): LeakyReLU(negative_slope=0.1)
743
+ )
744
+ (4): Sequential(
745
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
746
+ (1): LeakyReLU(negative_slope=0.1)
747
+ )
748
+ )
749
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
750
+ )
751
+ (2): HiFiGANPeriodDiscriminator(
752
+ (convs): ModuleList(
753
+ (0): Sequential(
754
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
755
+ (1): LeakyReLU(negative_slope=0.1)
756
+ )
757
+ (1): Sequential(
758
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
759
+ (1): LeakyReLU(negative_slope=0.1)
760
+ )
761
+ (2): Sequential(
762
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
763
+ (1): LeakyReLU(negative_slope=0.1)
764
+ )
765
+ (3): Sequential(
766
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
767
+ (1): LeakyReLU(negative_slope=0.1)
768
+ )
769
+ (4): Sequential(
770
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
771
+ (1): LeakyReLU(negative_slope=0.1)
772
+ )
773
+ )
774
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
775
+ )
776
+ (3): HiFiGANPeriodDiscriminator(
777
+ (convs): ModuleList(
778
+ (0): Sequential(
779
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
780
+ (1): LeakyReLU(negative_slope=0.1)
781
+ )
782
+ (1): Sequential(
783
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
784
+ (1): LeakyReLU(negative_slope=0.1)
785
+ )
786
+ (2): Sequential(
787
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
788
+ (1): LeakyReLU(negative_slope=0.1)
789
+ )
790
+ (3): Sequential(
791
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
792
+ (1): LeakyReLU(negative_slope=0.1)
793
+ )
794
+ (4): Sequential(
795
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
796
+ (1): LeakyReLU(negative_slope=0.1)
797
+ )
798
+ )
799
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
800
+ )
801
+ (4): HiFiGANPeriodDiscriminator(
802
+ (convs): ModuleList(
803
+ (0): Sequential(
804
+ (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
805
+ (1): LeakyReLU(negative_slope=0.1)
806
+ )
807
+ (1): Sequential(
808
+ (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
809
+ (1): LeakyReLU(negative_slope=0.1)
810
+ )
811
+ (2): Sequential(
812
+ (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
813
+ (1): LeakyReLU(negative_slope=0.1)
814
+ )
815
+ (3): Sequential(
816
+ (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
817
+ (1): LeakyReLU(negative_slope=0.1)
818
+ )
819
+ (4): Sequential(
820
+ (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
821
+ (1): LeakyReLU(negative_slope=0.1)
822
+ )
823
+ )
824
+ (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
825
+ )
826
+ )
827
+ )
828
+ )
829
+ (generator_adv_loss): GeneratorAdversarialLoss()
830
+ (discriminator_adv_loss): DiscriminatorAdversarialLoss()
831
+ (feat_match_loss): FeatureMatchLoss()
832
+ (mel_loss): MelSpectrogramLoss(
833
+ (wav_to_mel): LogMelFbank(
834
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
835
+ (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False)
836
+ )
837
+ )
838
+ (var_loss): VarianceLoss(
839
+ (mse_criterion): MSELoss()
840
+ (duration_criterion): DurationPredictorLoss(
841
+ (criterion): MSELoss()
842
+ )
843
+ )
844
+ (forwardsum_loss): ForwardSumLoss()
845
+ )
846
+ )
847
+
848
+ Model summary:
849
+ Class Name: ESPnetGANTTSModel
850
+ Total Number of model parameters: 83.28 M
851
+ Number of trainable parameters: 83.28 M (100.0%)
852
+ Size: 333.11 MB
853
+ Type: torch.float32
854
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer:
855
+ AdamW (
856
+ Parameter Group 0
857
+ amsgrad: False
858
+ betas: [0.8, 0.99]
859
+ eps: 1e-09
860
+ initial_lr: 0.0002
861
+ lr: 0.0002
862
+ weight_decay: 0.0
863
+ )
864
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f5660199550>
865
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer2:
866
+ AdamW (
867
+ Parameter Group 0
868
+ amsgrad: False
869
+ betas: [0.8, 0.99]
870
+ eps: 1e-09
871
+ initial_lr: 0.0002
872
+ lr: 0.0002
873
+ weight_decay: 0.0
874
+ )
875
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f5747efa9d0>
876
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1171) INFO: Saving the configuration in exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml
877
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1525) INFO: [train] dataset:
878
+ ESPnetDataset(
879
+ text: {"path": "dump/raw/jvs010_tr_no_dev/text", "type": "text"}
880
+ speech: {"path": "dump/raw/jvs010_tr_no_dev/wav.scp", "type": "sound"}
881
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660199dc0>)
882
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1526) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=4, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending)
883
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,699 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=4, mean=25.0, min=5, max=41
884
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1525) INFO: [valid] dataset:
885
+ ESPnetDataset(
886
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
887
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
888
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660199520>)
889
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1526) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=1, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending)
890
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=1, mean=15.0, min=15, max=15
891
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1525) INFO: [plot_att] dataset:
892
+ ESPnetDataset(
893
+ text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
894
+ speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
895
+ preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660155130>)
896
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15, batch_size=1, key_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn,
897
+ [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
898
+ 92b100c97f43:1159464:1159464 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
899
+ 92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
900
+
901
+ 92b100c97f43:1159464:1159464 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
902
+ 92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
903
+ 92b100c97f43:1159464:1159464 [0] NCCL INFO Using network Socket
904
+ NCCL version 2.10.3+cuda11.3
905
+ 92b100c97f43:1159466:1159466 [2] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
906
+ 92b100c97f43:1159465:1159465 [1] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
907
+ 92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
908
+ 92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
909
+
910
+ 92b100c97f43:1159466:1159466 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
911
+
912
+ 92b100c97f43:1159465:1159465 [1] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
913
+ 92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
914
+ 92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
915
+ 92b100c97f43:1159465:1159465 [1] NCCL INFO Using network Socket
916
+ 92b100c97f43:1159466:1159466 [2] NCCL INFO Using network Socket
917
+ 92b100c97f43:1159467:1159467 [3] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
918
+ 92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
919
+
920
+ 92b100c97f43:1159467:1159467 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
921
+ 92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
922
+ 92b100c97f43:1159467:1159467 [3] NCCL INFO Using network Socket
923
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00/02 : 0 1 2 3
924
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
925
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
926
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01/02 : 0 1 2 3
927
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
928
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff
929
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
930
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff
931
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff
932
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff
933
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 3[60] via direct shared memory
934
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 0[30] via direct shared memory
935
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00 : 0[30] -> 1[40] via direct shared memory
936
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 2[50] via direct shared memory
937
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 3[60] via direct shared memory
938
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 0[30] via direct shared memory
939
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01 : 0[30] -> 1[40] via direct shared memory
940
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 2[50] via direct shared memory
941
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all rings
942
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all rings
943
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all rings
944
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all rings
945
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 2[50] via direct shared memory
946
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 2[50] via direct shared memory
947
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 1[40] via direct shared memory
948
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 0[30] via direct shared memory
949
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 1[40] via direct shared memory
950
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 0[30] via direct shared memory
951
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all trees
952
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
953
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
954
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all trees
955
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
956
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
957
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all trees
958
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
959
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
960
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all trees
961
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
962
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
963
+ 92b100c97f43:1159466:1159503 [2] NCCL INFO comm 0x7f35f80030d0 rank 2 nranks 4 cudaDev 2 busId 50 - Init COMPLETE
964
+ 92b100c97f43:1159464:1159502 [0] NCCL INFO comm 0x7f55500030d0 rank 0 nranks 4 cudaDev 0 busId 30 - Init COMPLETE
965
+ 92b100c97f43:1159464:1159464 [0] NCCL INFO Launch mode Parallel
966
+ 92b100c97f43:1159465:1159504 [1] NCCL INFO comm 0x7f97600030d0 rank 1 nranks 4 cudaDev 1 busId 40 - Init COMPLETE
967
+ 92b100c97f43:1159467:1159505 [3] NCCL INFO comm 0x7f66b80030d0 rank 3 nranks 4 cudaDev 3 busId 60 - Init COMPLETE
968
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
969
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
970
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
971
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
972
+ [92b100c97f43:0/4] 2025-03-04 21:23:55,188 (trainer:280) INFO: 1/130epoch started
973
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
974
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
975
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
976
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
977
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
978
+ [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
979
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
980
+ olens = (ilens - self.n_fft) // self.hop_length + 1
981
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
982
+ olens = (ilens - self.n_fft) // self.hop_length + 1
983
+ [92b100c97f43:0/4] 2025-03-04 21:25:36,523 (gan_trainer:305) INFO: 1epoch:train:1-50batch: iter_time=0.048, generator_forward_time=0.723, generator_loss=139.743, generator_g_loss=110.582, generator_var_loss=5.224, generator_align_loss=23.937, generator_g_mel_loss=106.758, generator_g_adv_loss=2.179, generator_g_feat_match_loss=1.645, generator_var_dur_loss=0.584, generator_var_pitch_loss=2.400, generator_var_energy_loss=2.240, generator_align_forwardsum_loss=10.599, generator_align_bin_loss=1.369, generator_backward_time=0.254, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.113, discriminator_forward_time=0.544, discriminator_loss=2.766, discriminator_real_loss=1.518, discriminator_fake_loss=1.247, discriminator_backward_time=0.198, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.788, train_time=2.024
984
+ [92b100c97f43:0/4] 2025-03-04 21:27:08,245 (gan_trainer:305) INFO: 1epoch:train:51-100batch: iter_time=1.198e-04, generator_forward_time=0.634, generator_loss=111.648, generator_g_loss=85.935, generator_var_loss=2.179, generator_align_loss=23.534, generator_g_mel_loss=80.251, generator_g_adv_loss=2.332, generator_g_feat_match_loss=3.352, generator_var_dur_loss=0.089, generator_var_pitch_loss=0.924, generator_var_energy_loss=1.166, generator_align_forwardsum_loss=10.437, generator_align_bin_loss=1.330, generator_backward_time=0.258, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.027, discriminator_forward_time=0.548, discriminator_loss=2.396, discriminator_real_loss=1.381, discriminator_fake_loss=1.015, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.796, train_time=1.835
985
+ [92b100c97f43:0/4] 2025-03-04 21:28:38,897 (gan_trainer:305) INFO: 1epoch:train:101-150batch: iter_time=1.203e-04, generator_forward_time=0.624, generator_loss=112.406, generator_g_loss=87.597, generator_var_loss=1.890, generator_align_loss=22.919, generator_g_mel_loss=80.508, generator_g_adv_loss=2.744, generator_g_feat_match_loss=4.346, generator_var_dur_loss=0.058, generator_var_pitch_loss=0.808, generator_var_energy_loss=1.024, generator_align_forwardsum_loss=10.071, generator_align_bin_loss=1.389, generator_backward_time=0.257, generator_optim_step_time=0.033, optim0_lr0=2.000e-04, generator_train_time=1.015, discriminator_forward_time=0.539, discriminator_loss=2.084, discriminator_real_loss=1.319, discriminator_fake_loss=0.765, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.787, train_time=1.813
986
+ [92b100c97f43:0/4] 2025-03-04 21:30:10,556 (gan_trainer:305) INFO: 1epoch:train:151-200batch: iter_time=1.211e-04, generator_forward_time=0.634, generator_loss=111.480, generator_g_loss=87.402, generator_var_loss=1.820, generator_align_loss=22.257, generator_g_mel_loss=79.313, generator_g_adv_loss=2.954, generator_g_feat_match_loss=5.134, generator_var_dur_loss=0.061, generator_var_pitch_loss=0.765, generator_var_energy_loss=0.994, generator_align_forwardsum_loss=9.687, generator_align_bin_loss=1.442, generator_backward_time=0.258, generator_optim_step_time=0.033, optim0_lr0=2.000e-04, generator_train_time=1.026, discriminator_forward_time=0.548, discriminator_loss=1.898, discriminator_real_loss=1.237, discriminator_fake_loss=0.661, discriminator_backward_time=0.202, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.796, train_time=1.833
987
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
988
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
989
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
990
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
991
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
992
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
993
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
994
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
995
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
996
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
997
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
998
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
999
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1000
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1001
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1002
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1003
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1004
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1005
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1006
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1007
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1008
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1009
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1010
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1011
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1012
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1013
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1014
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1015
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1016
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1017
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1018
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1019
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1020
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1021
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1022
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1023
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1024
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1025
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1026
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1027
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1028
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1029
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1030
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1031
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1032
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1033
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1034
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1035
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1036
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1037
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1038
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1039
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1040
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1041
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1042
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1043
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1044
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1045
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1046
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1047
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1048
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1049
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1050
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1051
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1052
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1053
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1054
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1055
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1056
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1057
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1058
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1059
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1060
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1061
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1062
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1063
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1064
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1065
+ Traceback (most recent call last):
1066
+ File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
1067
+ return _run_code(code, main_globals, None,
1068
+ File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
1069
+ exec(code, run_globals)
1070
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 22, in <module>
1071
+ main()
1072
+ File "/work/espnet/espnet2/bin/gan_tts_train.py", line 18, in main
1073
+ GANTTSTask.main(cmd=cmd)
1074
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1069, in main
1075
+ while not ProcessContext(processes, error_queues).join():
1076
+ File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 99, in join
1077
+ ready = multiprocessing.connection.wait(
1078
+ File "/usr/lib/python3.8/multiprocessing/connection.py", line 931, in wait
1079
+ ready = selector.select(timeout)
1080
+ File "/usr/lib/python3.8/selectors.py", line 415, in select
1081
+ fd_event_list = self._selector.poll(timeout)
1082
+ KeyboardInterrupt
1083
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1084
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1085
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1086
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1087
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1088
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1089
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1090
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1091
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1092
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1093
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1094
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1095
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1096
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1097
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1098
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1099
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1100
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1101
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1102
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1103
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1104
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1105
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1106
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1107
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1108
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1109
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1110
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1111
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1112
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1113
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1114
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1115
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1116
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1117
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1118
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1119
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1120
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1121
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1122
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1123
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1124
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1125
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1126
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1127
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1128
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1129
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1130
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1131
+ /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
1132
+ warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
1133
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1134
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1135
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1136
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1137
+ Process SpawnProcess-2:
1138
+ Traceback (most recent call last):
1139
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1140
+ self.run()
1141
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1142
+ self._target(*self._args, **self._kwargs)
1143
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1144
+ cls.trainer.run(
1145
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1146
+ all_steps_are_invalid = cls.train_one_epoch(
1147
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 223, in train_one_epoch
1148
+ loss.backward()
1149
+ File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 307, in backward
1150
+ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
1151
+ File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 154, in backward
1152
+ Variable._execution_engine.run_backward(
1153
+ KeyboardInterrupt
1154
+ Process SpawnProcess-1:
1155
+ Traceback (most recent call last):
1156
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1157
+ self.run()
1158
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1159
+ self._target(*self._args, **self._kwargs)
1160
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1161
+ cls.trainer.run(
1162
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1163
+ all_steps_are_invalid = cls.train_one_epoch(
1164
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 223, in train_one_epoch
1165
+ loss.backward()
1166
+ File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 307, in backward
1167
+ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
1168
+ File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 154, in backward
1169
+ Variable._execution_engine.run_backward(
1170
+ KeyboardInterrupt
1171
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1172
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1173
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1174
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1175
+ Process SpawnProcess-3:
1176
+ Traceback (most recent call last):
1177
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1178
+ self.run()
1179
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1180
+ self._target(*self._args, **self._kwargs)
1181
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1182
+ cls.trainer.run(
1183
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1184
+ all_steps_are_invalid = cls.train_one_epoch(
1185
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 223, in train_one_epoch
1186
+ loss.backward()
1187
+ File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 307, in backward
1188
+ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
1189
+ File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 154, in backward
1190
+ Variable._execution_engine.run_backward(
1191
+ KeyboardInterrupt
1192
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1193
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1194
+ /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
1195
+ olens = (ilens - self.n_fft) // self.hop_length + 1
1196
+ Process SpawnProcess-4:
1197
+ Traceback (most recent call last):
1198
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
1199
+ self.run()
1200
+ File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
1201
+ self._target(*self._args, **self._kwargs)
1202
+ File "/work/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
1203
+ cls.trainer.run(
1204
+ File "/work/espnet/espnet2/train/trainer.py", line 286, in run
1205
+ all_steps_are_invalid = cls.train_one_epoch(
1206
+ File "/work/espnet/espnet2/train/gan_trainer.py", line 223, in train_one_epoch
1207
+ loss.backward()
1208
+ File "/usr/local/lib/python3.8/dist-packages/torch/_tensor.py", line 307, in backward
1209
+ torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
1210
+ File "/usr/local/lib/python3.8/dist-packages/torch/autograd/__init__.py", line 154, in backward
1211
+ Variable._execution_engine.run_backward(
1212
+ KeyboardInterrupt
exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.log CHANGED
@@ -1,985 +1,13 @@
1
  # python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
2
- # Started at Tue Mar 4 21:23:38 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
7
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
8
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
9
- [92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
10
- [92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
11
- [92b100c97f43:0/4] 2025-03-04 21:23:44,896 (gan_tts:304) INFO: Vocabulary size: 41
12
- [92b100c97f43:0/4] 2025-03-04 21:23:45,027 (encoder:172) INFO: encoder self-attention layer type = self-attention
13
- [92b100c97f43:0/4] 2025-03-04 21:23:45,249 (encoder:172) INFO: encoder self-attention layer type = self-attention
14
- [92b100c97f43:0/4] 2025-03-04 21:23:54,351 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False
15
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1158) INFO: Model structure:
16
- ESPnetGANTTSModel(
17
- (feats_extract): LogMelFbank(
18
- (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
19
- (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False)
20
- )
21
- (normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz, norm_means=True, norm_vars=True)
22
- (pitch_extract): Dio()
23
- (pitch_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz, norm_means=True, norm_vars=True)
24
- (energy_extract): Energy(
25
- (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True)
26
- )
27
- (energy_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz, norm_means=True, norm_vars=True)
28
- (tts): JETS(
29
- (generator): JETSGenerator(
30
- (encoder): Encoder(
31
- (embed): Sequential(
32
- (0): Embedding(41, 256, padding_idx=0)
33
- (1): ScaledPositionalEncoding(
34
- (dropout): Dropout(p=0.2, inplace=False)
35
- )
36
- )
37
- (encoders): MultiSequential(
38
- (0): EncoderLayer(
39
- (self_attn): MultiHeadedAttention(
40
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
41
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
42
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
43
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
44
- (dropout): Dropout(p=0.2, inplace=False)
45
- )
46
- (feed_forward): MultiLayeredConv1d(
47
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
48
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
49
- (dropout): Dropout(p=0.2, inplace=False)
50
- )
51
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
52
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
53
- (dropout): Dropout(p=0.2, inplace=False)
54
- )
55
- (1): EncoderLayer(
56
- (self_attn): MultiHeadedAttention(
57
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
58
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
59
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
60
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
61
- (dropout): Dropout(p=0.2, inplace=False)
62
- )
63
- (feed_forward): MultiLayeredConv1d(
64
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
65
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
66
- (dropout): Dropout(p=0.2, inplace=False)
67
- )
68
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
69
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
70
- (dropout): Dropout(p=0.2, inplace=False)
71
- )
72
- (2): EncoderLayer(
73
- (self_attn): MultiHeadedAttention(
74
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
75
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
76
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
77
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
78
- (dropout): Dropout(p=0.2, inplace=False)
79
- )
80
- (feed_forward): MultiLayeredConv1d(
81
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
82
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
83
- (dropout): Dropout(p=0.2, inplace=False)
84
- )
85
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
86
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
87
- (dropout): Dropout(p=0.2, inplace=False)
88
- )
89
- (3): EncoderLayer(
90
- (self_attn): MultiHeadedAttention(
91
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
92
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
93
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
94
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
95
- (dropout): Dropout(p=0.2, inplace=False)
96
- )
97
- (feed_forward): MultiLayeredConv1d(
98
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
99
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
100
- (dropout): Dropout(p=0.2, inplace=False)
101
- )
102
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
103
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
104
- (dropout): Dropout(p=0.2, inplace=False)
105
- )
106
- )
107
- (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
108
- )
109
- (duration_predictor): DurationPredictor(
110
- (conv): ModuleList(
111
- (0): Sequential(
112
- (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
113
- (1): ReLU()
114
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
115
- (3): Dropout(p=0.1, inplace=False)
116
- )
117
- (1): Sequential(
118
- (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
119
- (1): ReLU()
120
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
121
- (3): Dropout(p=0.1, inplace=False)
122
- )
123
- )
124
- (linear): Linear(in_features=256, out_features=1, bias=True)
125
- )
126
- (pitch_predictor): VariancePredictor(
127
- (conv): ModuleList(
128
- (0): Sequential(
129
- (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
130
- (1): ReLU()
131
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
132
- (3): Dropout(p=0.5, inplace=False)
133
- )
134
- (1): Sequential(
135
- (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
136
- (1): ReLU()
137
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
138
- (3): Dropout(p=0.5, inplace=False)
139
- )
140
- (2): Sequential(
141
- (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
142
- (1): ReLU()
143
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
144
- (3): Dropout(p=0.5, inplace=False)
145
- )
146
- (3): Sequential(
147
- (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
148
- (1): ReLU()
149
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
150
- (3): Dropout(p=0.5, inplace=False)
151
- )
152
- (4): Sequential(
153
- (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,))
154
- (1): ReLU()
155
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
156
- (3): Dropout(p=0.5, inplace=False)
157
- )
158
- )
159
- (linear): Linear(in_features=256, out_features=1, bias=True)
160
- )
161
- (pitch_embed): Sequential(
162
- (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
163
- (1): Dropout(p=0.0, inplace=False)
164
- )
165
- (energy_predictor): VariancePredictor(
166
- (conv): ModuleList(
167
- (0): Sequential(
168
- (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
169
- (1): ReLU()
170
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
171
- (3): Dropout(p=0.5, inplace=False)
172
- )
173
- (1): Sequential(
174
- (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
175
- (1): ReLU()
176
- (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
177
- (3): Dropout(p=0.5, inplace=False)
178
- )
179
- )
180
- (linear): Linear(in_features=256, out_features=1, bias=True)
181
- )
182
- (energy_embed): Sequential(
183
- (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,))
184
- (1): Dropout(p=0.0, inplace=False)
185
- )
186
- (alignment_module): AlignmentModule(
187
- (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
188
- (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
189
- (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,))
190
- (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
191
- (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
192
- )
193
- (length_regulator): GaussianUpsampling()
194
- (decoder): Encoder(
195
- (embed): Sequential(
196
- (0): ScaledPositionalEncoding(
197
- (dropout): Dropout(p=0.2, inplace=False)
198
- )
199
- )
200
- (encoders): MultiSequential(
201
- (0): EncoderLayer(
202
- (self_attn): MultiHeadedAttention(
203
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
204
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
205
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
206
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
207
- (dropout): Dropout(p=0.2, inplace=False)
208
- )
209
- (feed_forward): MultiLayeredConv1d(
210
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
211
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
212
- (dropout): Dropout(p=0.2, inplace=False)
213
- )
214
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
215
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
216
- (dropout): Dropout(p=0.2, inplace=False)
217
- )
218
- (1): EncoderLayer(
219
- (self_attn): MultiHeadedAttention(
220
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
221
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
222
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
223
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
224
- (dropout): Dropout(p=0.2, inplace=False)
225
- )
226
- (feed_forward): MultiLayeredConv1d(
227
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
228
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
229
- (dropout): Dropout(p=0.2, inplace=False)
230
- )
231
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
232
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
233
- (dropout): Dropout(p=0.2, inplace=False)
234
- )
235
- (2): EncoderLayer(
236
- (self_attn): MultiHeadedAttention(
237
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
238
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
239
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
240
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
241
- (dropout): Dropout(p=0.2, inplace=False)
242
- )
243
- (feed_forward): MultiLayeredConv1d(
244
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
245
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
246
- (dropout): Dropout(p=0.2, inplace=False)
247
- )
248
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
249
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
250
- (dropout): Dropout(p=0.2, inplace=False)
251
- )
252
- (3): EncoderLayer(
253
- (self_attn): MultiHeadedAttention(
254
- (linear_q): Linear(in_features=256, out_features=256, bias=True)
255
- (linear_k): Linear(in_features=256, out_features=256, bias=True)
256
- (linear_v): Linear(in_features=256, out_features=256, bias=True)
257
- (linear_out): Linear(in_features=256, out_features=256, bias=True)
258
- (dropout): Dropout(p=0.2, inplace=False)
259
- )
260
- (feed_forward): MultiLayeredConv1d(
261
- (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
262
- (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,))
263
- (dropout): Dropout(p=0.2, inplace=False)
264
- )
265
- (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
266
- (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
267
- (dropout): Dropout(p=0.2, inplace=False)
268
- )
269
- )
270
- (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
271
- )
272
- (generator): HiFiGANGenerator(
273
- (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,))
274
- (upsamples): ModuleList(
275
- (0): Sequential(
276
- (0): LeakyReLU(negative_slope=0.1)
277
- (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,))
278
- )
279
- (1): Sequential(
280
- (0): LeakyReLU(negative_slope=0.1)
281
- (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,))
282
- )
283
- (2): Sequential(
284
- (0): LeakyReLU(negative_slope=0.1)
285
- (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,))
286
- )
287
- (3): Sequential(
288
- (0): LeakyReLU(negative_slope=0.1)
289
- (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,))
290
- )
291
- )
292
- (blocks): ModuleList(
293
- (0): ResidualBlock(
294
- (convs1): ModuleList(
295
- (0): Sequential(
296
- (0): LeakyReLU(negative_slope=0.1)
297
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
298
- )
299
- (1): Sequential(
300
- (0): LeakyReLU(negative_slope=0.1)
301
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
302
- )
303
- (2): Sequential(
304
- (0): LeakyReLU(negative_slope=0.1)
305
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
306
- )
307
- )
308
- (convs2): ModuleList(
309
- (0): Sequential(
310
- (0): LeakyReLU(negative_slope=0.1)
311
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
312
- )
313
- (1): Sequential(
314
- (0): LeakyReLU(negative_slope=0.1)
315
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
316
- )
317
- (2): Sequential(
318
- (0): LeakyReLU(negative_slope=0.1)
319
- (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
320
- )
321
- )
322
- )
323
- (1): ResidualBlock(
324
- (convs1): ModuleList(
325
- (0): Sequential(
326
- (0): LeakyReLU(negative_slope=0.1)
327
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
328
- )
329
- (1): Sequential(
330
- (0): LeakyReLU(negative_slope=0.1)
331
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
332
- )
333
- (2): Sequential(
334
- (0): LeakyReLU(negative_slope=0.1)
335
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
336
- )
337
- )
338
- (convs2): ModuleList(
339
- (0): Sequential(
340
- (0): LeakyReLU(negative_slope=0.1)
341
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
342
- )
343
- (1): Sequential(
344
- (0): LeakyReLU(negative_slope=0.1)
345
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
346
- )
347
- (2): Sequential(
348
- (0): LeakyReLU(negative_slope=0.1)
349
- (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,))
350
- )
351
- )
352
- )
353
- (2): ResidualBlock(
354
- (convs1): ModuleList(
355
- (0): Sequential(
356
- (0): LeakyReLU(negative_slope=0.1)
357
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
358
- )
359
- (1): Sequential(
360
- (0): LeakyReLU(negative_slope=0.1)
361
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
362
- )
363
- (2): Sequential(
364
- (0): LeakyReLU(negative_slope=0.1)
365
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
366
- )
367
- )
368
- (convs2): ModuleList(
369
- (0): Sequential(
370
- (0): LeakyReLU(negative_slope=0.1)
371
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
372
- )
373
- (1): Sequential(
374
- (0): LeakyReLU(negative_slope=0.1)
375
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
376
- )
377
- (2): Sequential(
378
- (0): LeakyReLU(negative_slope=0.1)
379
- (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,))
380
- )
381
- )
382
- )
383
- (3): ResidualBlock(
384
- (convs1): ModuleList(
385
- (0): Sequential(
386
- (0): LeakyReLU(negative_slope=0.1)
387
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
388
- )
389
- (1): Sequential(
390
- (0): LeakyReLU(negative_slope=0.1)
391
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
392
- )
393
- (2): Sequential(
394
- (0): LeakyReLU(negative_slope=0.1)
395
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
396
- )
397
- )
398
- (convs2): ModuleList(
399
- (0): Sequential(
400
- (0): LeakyReLU(negative_slope=0.1)
401
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
402
- )
403
- (1): Sequential(
404
- (0): LeakyReLU(negative_slope=0.1)
405
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
406
- )
407
- (2): Sequential(
408
- (0): LeakyReLU(negative_slope=0.1)
409
- (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
410
- )
411
- )
412
- )
413
- (4): ResidualBlock(
414
- (convs1): ModuleList(
415
- (0): Sequential(
416
- (0): LeakyReLU(negative_slope=0.1)
417
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
418
- )
419
- (1): Sequential(
420
- (0): LeakyReLU(negative_slope=0.1)
421
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
422
- )
423
- (2): Sequential(
424
- (0): LeakyReLU(negative_slope=0.1)
425
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
426
- )
427
- )
428
- (convs2): ModuleList(
429
- (0): Sequential(
430
- (0): LeakyReLU(negative_slope=0.1)
431
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
432
- )
433
- (1): Sequential(
434
- (0): LeakyReLU(negative_slope=0.1)
435
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
436
- )
437
- (2): Sequential(
438
- (0): LeakyReLU(negative_slope=0.1)
439
- (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
440
- )
441
- )
442
- )
443
- (5): ResidualBlock(
444
- (convs1): ModuleList(
445
- (0): Sequential(
446
- (0): LeakyReLU(negative_slope=0.1)
447
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
448
- )
449
- (1): Sequential(
450
- (0): LeakyReLU(negative_slope=0.1)
451
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
452
- )
453
- (2): Sequential(
454
- (0): LeakyReLU(negative_slope=0.1)
455
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
456
- )
457
- )
458
- (convs2): ModuleList(
459
- (0): Sequential(
460
- (0): LeakyReLU(negative_slope=0.1)
461
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
462
- )
463
- (1): Sequential(
464
- (0): LeakyReLU(negative_slope=0.1)
465
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
466
- )
467
- (2): Sequential(
468
- (0): LeakyReLU(negative_slope=0.1)
469
- (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,))
470
- )
471
- )
472
- )
473
- (6): ResidualBlock(
474
- (convs1): ModuleList(
475
- (0): Sequential(
476
- (0): LeakyReLU(negative_slope=0.1)
477
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
478
- )
479
- (1): Sequential(
480
- (0): LeakyReLU(negative_slope=0.1)
481
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
482
- )
483
- (2): Sequential(
484
- (0): LeakyReLU(negative_slope=0.1)
485
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
486
- )
487
- )
488
- (convs2): ModuleList(
489
- (0): Sequential(
490
- (0): LeakyReLU(negative_slope=0.1)
491
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
492
- )
493
- (1): Sequential(
494
- (0): LeakyReLU(negative_slope=0.1)
495
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
496
- )
497
- (2): Sequential(
498
- (0): LeakyReLU(negative_slope=0.1)
499
- (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
500
- )
501
- )
502
- )
503
- (7): ResidualBlock(
504
- (convs1): ModuleList(
505
- (0): Sequential(
506
- (0): LeakyReLU(negative_slope=0.1)
507
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
508
- )
509
- (1): Sequential(
510
- (0): LeakyReLU(negative_slope=0.1)
511
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
512
- )
513
- (2): Sequential(
514
- (0): LeakyReLU(negative_slope=0.1)
515
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
516
- )
517
- )
518
- (convs2): ModuleList(
519
- (0): Sequential(
520
- (0): LeakyReLU(negative_slope=0.1)
521
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
522
- )
523
- (1): Sequential(
524
- (0): LeakyReLU(negative_slope=0.1)
525
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
526
- )
527
- (2): Sequential(
528
- (0): LeakyReLU(negative_slope=0.1)
529
- (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
530
- )
531
- )
532
- )
533
- (8): ResidualBlock(
534
- (convs1): ModuleList(
535
- (0): Sequential(
536
- (0): LeakyReLU(negative_slope=0.1)
537
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
538
- )
539
- (1): Sequential(
540
- (0): LeakyReLU(negative_slope=0.1)
541
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
542
- )
543
- (2): Sequential(
544
- (0): LeakyReLU(negative_slope=0.1)
545
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
546
- )
547
- )
548
- (convs2): ModuleList(
549
- (0): Sequential(
550
- (0): LeakyReLU(negative_slope=0.1)
551
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
552
- )
553
- (1): Sequential(
554
- (0): LeakyReLU(negative_slope=0.1)
555
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
556
- )
557
- (2): Sequential(
558
- (0): LeakyReLU(negative_slope=0.1)
559
- (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,))
560
- )
561
- )
562
- )
563
- (9): ResidualBlock(
564
- (convs1): ModuleList(
565
- (0): Sequential(
566
- (0): LeakyReLU(negative_slope=0.1)
567
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
568
- )
569
- (1): Sequential(
570
- (0): LeakyReLU(negative_slope=0.1)
571
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
572
- )
573
- (2): Sequential(
574
- (0): LeakyReLU(negative_slope=0.1)
575
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,))
576
- )
577
- )
578
- (convs2): ModuleList(
579
- (0): Sequential(
580
- (0): LeakyReLU(negative_slope=0.1)
581
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
582
- )
583
- (1): Sequential(
584
- (0): LeakyReLU(negative_slope=0.1)
585
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
586
- )
587
- (2): Sequential(
588
- (0): LeakyReLU(negative_slope=0.1)
589
- (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,))
590
- )
591
- )
592
- )
593
- (10): ResidualBlock(
594
- (convs1): ModuleList(
595
- (0): Sequential(
596
- (0): LeakyReLU(negative_slope=0.1)
597
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
598
- )
599
- (1): Sequential(
600
- (0): LeakyReLU(negative_slope=0.1)
601
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
602
- )
603
- (2): Sequential(
604
- (0): LeakyReLU(negative_slope=0.1)
605
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,))
606
- )
607
- )
608
- (convs2): ModuleList(
609
- (0): Sequential(
610
- (0): LeakyReLU(negative_slope=0.1)
611
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
612
- )
613
- (1): Sequential(
614
- (0): LeakyReLU(negative_slope=0.1)
615
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
616
- )
617
- (2): Sequential(
618
- (0): LeakyReLU(negative_slope=0.1)
619
- (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,))
620
- )
621
- )
622
- )
623
- (11): ResidualBlock(
624
- (convs1): ModuleList(
625
- (0): Sequential(
626
- (0): LeakyReLU(negative_slope=0.1)
627
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
628
- )
629
- (1): Sequential(
630
- (0): LeakyReLU(negative_slope=0.1)
631
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,))
632
- )
633
- (2): Sequential(
634
- (0): LeakyReLU(negative_slope=0.1)
635
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,))
636
- )
637
- )
638
- (convs2): ModuleList(
639
- (0): Sequential(
640
- (0): LeakyReLU(negative_slope=0.1)
641
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
642
- )
643
- (1): Sequential(
644
- (0): LeakyReLU(negative_slope=0.1)
645
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
646
- )
647
- (2): Sequential(
648
- (0): LeakyReLU(negative_slope=0.1)
649
- (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,))
650
- )
651
- )
652
- )
653
- )
654
- (output_conv): Sequential(
655
- (0): LeakyReLU(negative_slope=0.01)
656
- (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,))
657
- (2): Tanh()
658
- )
659
- )
660
- )
661
- (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator(
662
- (msd): HiFiGANMultiScaleDiscriminator(
663
- (discriminators): ModuleList(
664
- (0): HiFiGANScaleDiscriminator(
665
- (layers): ModuleList(
666
- (0): Sequential(
667
- (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,))
668
- (1): LeakyReLU(negative_slope=0.1)
669
- )
670
- (1): Sequential(
671
- (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4)
672
- (1): LeakyReLU(negative_slope=0.1)
673
- )
674
- (2): Sequential(
675
- (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16)
676
- (1): LeakyReLU(negative_slope=0.1)
677
- )
678
- (3): Sequential(
679
- (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
680
- (1): LeakyReLU(negative_slope=0.1)
681
- )
682
- (4): Sequential(
683
- (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16)
684
- (1): LeakyReLU(negative_slope=0.1)
685
- )
686
- (5): Sequential(
687
- (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16)
688
- (1): LeakyReLU(negative_slope=0.1)
689
- )
690
- (6): Sequential(
691
- (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,))
692
- (1): LeakyReLU(negative_slope=0.1)
693
- )
694
- (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,))
695
- )
696
- )
697
- )
698
- )
699
- (mpd): HiFiGANMultiPeriodDiscriminator(
700
- (discriminators): ModuleList(
701
- (0): HiFiGANPeriodDiscriminator(
702
- (convs): ModuleList(
703
- (0): Sequential(
704
- (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
705
- (1): LeakyReLU(negative_slope=0.1)
706
- )
707
- (1): Sequential(
708
- (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
709
- (1): LeakyReLU(negative_slope=0.1)
710
- )
711
- (2): Sequential(
712
- (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
713
- (1): LeakyReLU(negative_slope=0.1)
714
- )
715
- (3): Sequential(
716
- (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
717
- (1): LeakyReLU(negative_slope=0.1)
718
- )
719
- (4): Sequential(
720
- (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
721
- (1): LeakyReLU(negative_slope=0.1)
722
- )
723
- )
724
- (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
725
- )
726
- (1): HiFiGANPeriodDiscriminator(
727
- (convs): ModuleList(
728
- (0): Sequential(
729
- (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
730
- (1): LeakyReLU(negative_slope=0.1)
731
- )
732
- (1): Sequential(
733
- (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
734
- (1): LeakyReLU(negative_slope=0.1)
735
- )
736
- (2): Sequential(
737
- (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
738
- (1): LeakyReLU(negative_slope=0.1)
739
- )
740
- (3): Sequential(
741
- (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
742
- (1): LeakyReLU(negative_slope=0.1)
743
- )
744
- (4): Sequential(
745
- (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
746
- (1): LeakyReLU(negative_slope=0.1)
747
- )
748
- )
749
- (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
750
- )
751
- (2): HiFiGANPeriodDiscriminator(
752
- (convs): ModuleList(
753
- (0): Sequential(
754
- (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
755
- (1): LeakyReLU(negative_slope=0.1)
756
- )
757
- (1): Sequential(
758
- (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
759
- (1): LeakyReLU(negative_slope=0.1)
760
- )
761
- (2): Sequential(
762
- (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
763
- (1): LeakyReLU(negative_slope=0.1)
764
- )
765
- (3): Sequential(
766
- (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
767
- (1): LeakyReLU(negative_slope=0.1)
768
- )
769
- (4): Sequential(
770
- (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
771
- (1): LeakyReLU(negative_slope=0.1)
772
- )
773
- )
774
- (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
775
- )
776
- (3): HiFiGANPeriodDiscriminator(
777
- (convs): ModuleList(
778
- (0): Sequential(
779
- (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
780
- (1): LeakyReLU(negative_slope=0.1)
781
- )
782
- (1): Sequential(
783
- (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
784
- (1): LeakyReLU(negative_slope=0.1)
785
- )
786
- (2): Sequential(
787
- (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
788
- (1): LeakyReLU(negative_slope=0.1)
789
- )
790
- (3): Sequential(
791
- (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
792
- (1): LeakyReLU(negative_slope=0.1)
793
- )
794
- (4): Sequential(
795
- (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
796
- (1): LeakyReLU(negative_slope=0.1)
797
- )
798
- )
799
- (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
800
- )
801
- (4): HiFiGANPeriodDiscriminator(
802
- (convs): ModuleList(
803
- (0): Sequential(
804
- (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
805
- (1): LeakyReLU(negative_slope=0.1)
806
- )
807
- (1): Sequential(
808
- (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
809
- (1): LeakyReLU(negative_slope=0.1)
810
- )
811
- (2): Sequential(
812
- (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
813
- (1): LeakyReLU(negative_slope=0.1)
814
- )
815
- (3): Sequential(
816
- (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0))
817
- (1): LeakyReLU(negative_slope=0.1)
818
- )
819
- (4): Sequential(
820
- (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
821
- (1): LeakyReLU(negative_slope=0.1)
822
- )
823
- )
824
- (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0))
825
- )
826
- )
827
- )
828
- )
829
- (generator_adv_loss): GeneratorAdversarialLoss()
830
- (discriminator_adv_loss): DiscriminatorAdversarialLoss()
831
- (feat_match_loss): FeatureMatchLoss()
832
- (mel_loss): MelSpectrogramLoss(
833
- (wav_to_mel): LogMelFbank(
834
- (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
835
- (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False)
836
- )
837
- )
838
- (var_loss): VarianceLoss(
839
- (mse_criterion): MSELoss()
840
- (duration_criterion): DurationPredictorLoss(
841
- (criterion): MSELoss()
842
- )
843
- )
844
- (forwardsum_loss): ForwardSumLoss()
845
- )
846
- )
847
-
848
- Model summary:
849
- Class Name: ESPnetGANTTSModel
850
- Total Number of model parameters: 83.28 M
851
- Number of trainable parameters: 83.28 M (100.0%)
852
- Size: 333.11 MB
853
- Type: torch.float32
854
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer:
855
- AdamW (
856
- Parameter Group 0
857
- amsgrad: False
858
- betas: [0.8, 0.99]
859
- eps: 1e-09
860
- initial_lr: 0.0002
861
- lr: 0.0002
862
- weight_decay: 0.0
863
- )
864
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f5660199550>
865
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer2:
866
- AdamW (
867
- Parameter Group 0
868
- amsgrad: False
869
- betas: [0.8, 0.99]
870
- eps: 1e-09
871
- initial_lr: 0.0002
872
- lr: 0.0002
873
- weight_decay: 0.0
874
- )
875
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler2: <torch.optim.lr_scheduler.ExponentialLR object at 0x7f5747efa9d0>
876
- [92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1171) INFO: Saving the configuration in exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml
877
- [92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1525) INFO: [train] dataset:
878
- ESPnetDataset(
879
- text: {"path": "dump/raw/jvs010_tr_no_dev/text", "type": "text"}
880
- speech: {"path": "dump/raw/jvs010_tr_no_dev/wav.scp", "type": "sound"}
881
- preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660199dc0>)
882
- [92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1526) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=4, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending)
883
- [92b100c97f43:0/4] 2025-03-04 21:23:54,699 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=4, mean=25.0, min=5, max=41
884
- [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1525) INFO: [valid] dataset:
885
- ESPnetDataset(
886
- text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
887
- speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
888
- preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660199520>)
889
- [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1526) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=1, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending)
890
- [92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=1, mean=15.0, min=15, max=15
891
- [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1525) INFO: [plot_att] dataset:
892
- ESPnetDataset(
893
- text: {"path": "dump/raw/jvs010_dev/text", "type": "text"}
894
- speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"}
895
- preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7f5660155130>)
896
- [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15, batch_size=1, key_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn,
897
- [92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
898
- 92b100c97f43:1159464:1159464 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
899
- 92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
900
-
901
- 92b100c97f43:1159464:1159464 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
902
- 92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
903
- 92b100c97f43:1159464:1159464 [0] NCCL INFO Using network Socket
904
- NCCL version 2.10.3+cuda11.3
905
- 92b100c97f43:1159466:1159466 [2] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
906
- 92b100c97f43:1159465:1159465 [1] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
907
- 92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
908
- 92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
909
-
910
- 92b100c97f43:1159466:1159466 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
911
-
912
- 92b100c97f43:1159465:1159465 [1] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
913
- 92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
914
- 92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
915
- 92b100c97f43:1159465:1159465 [1] NCCL INFO Using network Socket
916
- 92b100c97f43:1159466:1159466 [2] NCCL INFO Using network Socket
917
- 92b100c97f43:1159467:1159467 [3] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0>
918
- 92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
919
-
920
- 92b100c97f43:1159467:1159467 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
921
- 92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0>
922
- 92b100c97f43:1159467:1159467 [3] NCCL INFO Using network Socket
923
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00/02 : 0 1 2 3
924
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
925
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
926
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01/02 : 0 1 2 3
927
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
928
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff
929
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
930
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff
931
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff
932
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff
933
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 3[60] via direct shared memory
934
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 0[30] via direct shared memory
935
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00 : 0[30] -> 1[40] via direct shared memory
936
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 2[50] via direct shared memory
937
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 3[60] via direct shared memory
938
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 0[30] via direct shared memory
939
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01 : 0[30] -> 1[40] via direct shared memory
940
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 2[50] via direct shared memory
941
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all rings
942
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all rings
943
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all rings
944
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all rings
945
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 2[50] via direct shared memory
946
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 2[50] via direct shared memory
947
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 1[40] via direct shared memory
948
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 0[30] via direct shared memory
949
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 1[40] via direct shared memory
950
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 0[30] via direct shared memory
951
- 92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all trees
952
- 92b100c97f43:1159464:1159502 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
953
- 92b100c97f43:1159464:1159502 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
954
- 92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all trees
955
- 92b100c97f43:1159467:1159505 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
956
- 92b100c97f43:1159467:1159505 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
957
- 92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all trees
958
- 92b100c97f43:1159465:1159504 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
959
- 92b100c97f43:1159465:1159504 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
960
- 92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all trees
961
- 92b100c97f43:1159466:1159503 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
962
- 92b100c97f43:1159466:1159503 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
963
- 92b100c97f43:1159466:1159503 [2] NCCL INFO comm 0x7f35f80030d0 rank 2 nranks 4 cudaDev 2 busId 50 - Init COMPLETE
964
- 92b100c97f43:1159464:1159502 [0] NCCL INFO comm 0x7f55500030d0 rank 0 nranks 4 cudaDev 0 busId 30 - Init COMPLETE
965
- 92b100c97f43:1159464:1159464 [0] NCCL INFO Launch mode Parallel
966
- 92b100c97f43:1159465:1159504 [1] NCCL INFO comm 0x7f97600030d0 rank 1 nranks 4 cudaDev 1 busId 40 - Init COMPLETE
967
- 92b100c97f43:1159467:1159505 [3] NCCL INFO comm 0x7f66b80030d0 rank 3 nranks 4 cudaDev 3 busId 60 - Init COMPLETE
968
- /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
969
- warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
970
- /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
971
- warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
972
- [92b100c97f43:0/4] 2025-03-04 21:23:55,188 (trainer:280) INFO: 1/130epoch started
973
- /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
974
- warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
975
- [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
976
- [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
977
- [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
978
- [W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
979
- /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
980
- olens = (ilens - self.n_fft) // self.hop_length + 1
981
- /work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
982
- olens = (ilens - self.n_fft) // self.hop_length + 1
983
- [92b100c97f43:0/4] 2025-03-04 21:25:36,523 (gan_trainer:305) INFO: 1epoch:train:1-50batch: iter_time=0.048, generator_forward_time=0.723, generator_loss=139.743, generator_g_loss=110.582, generator_var_loss=5.224, generator_align_loss=23.937, generator_g_mel_loss=106.758, generator_g_adv_loss=2.179, generator_g_feat_match_loss=1.645, generator_var_dur_loss=0.584, generator_var_pitch_loss=2.400, generator_var_energy_loss=2.240, generator_align_forwardsum_loss=10.599, generator_align_bin_loss=1.369, generator_backward_time=0.254, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.113, discriminator_forward_time=0.544, discriminator_loss=2.766, discriminator_real_loss=1.518, discriminator_fake_loss=1.247, discriminator_backward_time=0.198, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.788, train_time=2.024
984
- [92b100c97f43:0/4] 2025-03-04 21:27:08,245 (gan_trainer:305) INFO: 1epoch:train:51-100batch: iter_time=1.198e-04, generator_forward_time=0.634, generator_loss=111.648, generator_g_loss=85.935, generator_var_loss=2.179, generator_align_loss=23.534, generator_g_mel_loss=80.251, generator_g_adv_loss=2.332, generator_g_feat_match_loss=3.352, generator_var_dur_loss=0.089, generator_var_pitch_loss=0.924, generator_var_energy_loss=1.166, generator_align_forwardsum_loss=10.437, generator_align_bin_loss=1.330, generator_backward_time=0.258, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.027, discriminator_forward_time=0.548, discriminator_loss=2.396, discriminator_real_loss=1.381, discriminator_fake_loss=1.015, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.796, train_time=1.835
985
- [92b100c97f43:0/4] 2025-03-04 21:28:38,897 (gan_trainer:305) INFO: 1epoch:train:101-150batch: iter_time=1.203e-04, generator_forward_time=0.624, generator_loss=112.406, generator_g_loss=87.597, generator_var_loss=1.890, generator_align_loss=22.919, generator_g_mel_loss=80.508, generator_g_adv_loss=2.744, generator_g_feat_match_loss=4.346, generator_var_dur_loss=0.058, generator_var_pitch_loss=0.808, generator_var_energy_loss=1.024, generator_align_forwardsum_loss=10.071, generator_align_bin_loss=1.389, generator_backward_time=0.257, generator_optim_step_time=0.033, optim0_lr0=2.000e-04, generator_train_time=1.015, discriminator_forward_time=0.539, discriminator_loss=2.084, discriminator_real_loss=1.319, discriminator_fake_loss=0.765, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.787, train_time=1.813
 
1
  # python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
2
+ # Started at Tue Mar 4 22:09:50 JST 2025
3
  #
4
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
5
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
6
  /usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True
7
  /usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version!
8
  warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported "
9
+ [92b100c97f43:0/4] 2025-03-04 22:09:57,737 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
10
+ [92b100c97f43:0/4] 2025-03-04 22:09:57,738 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.
11
+ [92b100c97f43:0/4] 2025-03-04 22:09:57,793 (gan_tts:304) INFO: Vocabulary size: 41
12
+ [92b100c97f43:0/4] 2025-03-04 22:09:57,962 (encoder:172) INFO: encoder self-attention layer type = self-attention
13
+ [92b100c97f43:0/4] 2025-03-04 22:09:58,184 (encoder:172) INFO: encoder self-attention layer type = self-attention