diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1.log new file mode 100644 index 0000000000000000000000000000000000000000..3677f187681d45fa3f45d989f654a75894d59f4d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,215 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,440 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,563 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,676 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,686 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,705 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2054db771aadc4baad84c61353ec3c770e2791c7 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ba2a47d50a14b50e54dd6e7d6ef906a5e548314c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71835ae88e42963ae5e3de88a9c34a4e434c2dd2662455b4e61f347e2bd7f6f2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..3d1904c0495a24cabae8d9e97172326bad59f81a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5541947d077b0041a3487937ee9145a25d632bc5bbd43711f977b48ac2569c6b +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ba2a47d50a14b50e54dd6e7d6ef906a5e548314c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71835ae88e42963ae5e3de88a9c34a4e434c2dd2662455b4e61f347e2bd7f6f2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..d16bf6ab433f43fc292714a6f3b89b13c4accc1c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc5a955096b7b154b0e20365fd4a1cae7cc6fdf0e5cc0181abc8ca0dbba542af +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ba2a47d50a14b50e54dd6e7d6ef906a5e548314c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71835ae88e42963ae5e3de88a9c34a4e434c2dd2662455b4e61f347e2bd7f6f2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..025d98c772b400feda42929f425410c0e88675fa --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802e7de46e59805b1ba5263f3208ccfc91ab4cd6c1ca757b7ec507f8f3d07bcc +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3d4a8ba4580c6f4d2dee2822f18746ab3a4bbd0a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/speech_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_001 178800 +jvs010_VOICEACTRESS100_002 198240 +jvs010_VOICEACTRESS100_003 128640 +jvs010_VOICEACTRESS100_004 132000 +jvs010_VOICEACTRESS100_005 277440 +jvs010_VOICEACTRESS100_006 94560 +jvs010_VOICEACTRESS100_007 182160 +jvs010_VOICEACTRESS100_008 180960 +jvs010_VOICEACTRESS100_009 145920 +jvs010_VOICEACTRESS100_010 105359 +jvs010_VOICEACTRESS100_011 148080 +jvs010_VOICEACTRESS100_012 130320 +jvs010_VOICEACTRESS100_013 117839 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f2bb1ed39dbd114a95e5e395948c056cc5c40441 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/train/text_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_001 78 +jvs010_VOICEACTRESS100_002 91 +jvs010_VOICEACTRESS100_003 69 +jvs010_VOICEACTRESS100_004 64 +jvs010_VOICEACTRESS100_005 121 +jvs010_VOICEACTRESS100_006 49 +jvs010_VOICEACTRESS100_007 93 +jvs010_VOICEACTRESS100_008 82 +jvs010_VOICEACTRESS100_009 69 +jvs010_VOICEACTRESS100_010 49 +jvs010_VOICEACTRESS100_011 77 +jvs010_VOICEACTRESS100_012 55 +jvs010_VOICEACTRESS100_013 54 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..30a8d9ad9b045fc992e070b71e19a47c1fa82305 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149732b1e4eb869bc4896e15295f8daa910f81fa02a9f50c1821473ebedc9e3f +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..22eab2573b8727db77209131464352da5511053d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4931a92d9fcb8b8b598841c3589ac0e11b838be56304fd37301a3fc02a30bff5 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..30a8d9ad9b045fc992e070b71e19a47c1fa82305 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149732b1e4eb869bc4896e15295f8daa910f81fa02a9f50c1821473ebedc9e3f +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..0abae07fe3c8bbcaec32758a898d749b3faaee0e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc879120415bfea6b9b8b9559285f9b75cc8881682d326559b920626c778332e +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..30a8d9ad9b045fc992e070b71e19a47c1fa82305 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149732b1e4eb869bc4896e15295f8daa910f81fa02a9f50c1821473ebedc9e3f +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bbdca7f68834f24add9196c8a0fd6bd314673498 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e277313a3d955098f9f250192f18bce2b31d2a442dc9e8a3ac909b9238e66c1 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..ff1492d14a19dd3adb0459fcf185bc2db205c8de --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0113 143040 +jvs010_BASIC5000_0261 77520 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..cc42643ddec02903cb06a8acdcca44837e566e0a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.1/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0113 56 +jvs010_BASIC5000_0261 39 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2.log new file mode 100644 index 0000000000000000000000000000000000000000..8b56b4647b78b50985dbba1b6756f722cee97822 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,163 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,507 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,643 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,653 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,654 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,672 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cb770b9f8d7959bfe349d6b805543f55423593c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deaa69aaeebc1a10e996a8443e6f2ead241a62da --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe9bc4321cc000a1484565eb1b696bf9c1ed193a806ec8ebf606d4207b939d3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..da82e8e3204c9c77f9c312967a70c1c92ad0a132 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd01373077c58b6e8e5e20227f37180f96d8717bdf9c4e0bd17b93206fcd5a6 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deaa69aaeebc1a10e996a8443e6f2ead241a62da --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe9bc4321cc000a1484565eb1b696bf9c1ed193a806ec8ebf606d4207b939d3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b96b7e2a760b55622c225c5d3421b620bf90a410 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0205589d93485e825bd25b9c6b5e1714ca6dae977b3f555591874bacaffed623 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deaa69aaeebc1a10e996a8443e6f2ead241a62da --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efe9bc4321cc000a1484565eb1b696bf9c1ed193a806ec8ebf606d4207b939d3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e08be0fdb03c5cd8ed9bd42ef6d0072be8e377d2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a418a92a86d6891b0a6ec4fbd84592a5df85011ea03995a231bb6e60f830f9ce +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..13dc948be125cbc361802199b9d46dd7c5992be0 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/speech_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_014 56160 +jvs010_VOICEACTRESS100_015 104640 +jvs010_VOICEACTRESS100_016 122880 +jvs010_VOICEACTRESS100_017 148080 +jvs010_VOICEACTRESS100_018 161041 +jvs010_VOICEACTRESS100_019 185040 +jvs010_VOICEACTRESS100_020 162000 +jvs010_VOICEACTRESS100_021 154320 +jvs010_VOICEACTRESS100_022 314880 +jvs010_VOICEACTRESS100_023 150480 +jvs010_VOICEACTRESS100_024 197040 +jvs010_VOICEACTRESS100_025 98880 +jvs010_VOICEACTRESS100_026 89040 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..dfc9a41f1229d3dda73b5f051b5bd3331c5f8a12 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/train/text_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_014 33 +jvs010_VOICEACTRESS100_015 47 +jvs010_VOICEACTRESS100_016 62 +jvs010_VOICEACTRESS100_017 60 +jvs010_VOICEACTRESS100_018 78 +jvs010_VOICEACTRESS100_019 87 +jvs010_VOICEACTRESS100_020 80 +jvs010_VOICEACTRESS100_021 67 +jvs010_VOICEACTRESS100_022 124 +jvs010_VOICEACTRESS100_023 69 +jvs010_VOICEACTRESS100_024 86 +jvs010_VOICEACTRESS100_025 54 +jvs010_VOICEACTRESS100_026 40 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f36e827825686c8136e9dafa98843e8067f3eba2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04927f2b066ae5e9114b3f175a0890ad05a42039ba563ddf956476dde6575140 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..155148d9b2c30db426c844342e85ca95dee297d7 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a5191cf7f4b801f806ca1d7f5b73c5d530a273d73d64158189a4bf9b8c4494 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f36e827825686c8136e9dafa98843e8067f3eba2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04927f2b066ae5e9114b3f175a0890ad05a42039ba563ddf956476dde6575140 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1c70602b0b1c1dff823aab7468c4b745cc954387 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8766fdf1445b7fa6eaa6fffb246419dac1dac3315ee84b24a0546d58e5974aec +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f36e827825686c8136e9dafa98843e8067f3eba2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04927f2b066ae5e9114b3f175a0890ad05a42039ba563ddf956476dde6575140 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..077cd3f6f7e5088ed8c3f5a7671fb9e034540497 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fed5ab4f8c3b3f81c11011b7f6d1eb8c0435c4c5a56361938accde0be1f3740 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..377a9f3a6377173753a4b53f31f68fbe521bdf22 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0351 72720 +jvs010_BASIC5000_0882 91440 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..a82ac846fa3181e9de35cd2bb3d042eb6a840ab7 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.2/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0351 36 +jvs010_BASIC5000_0882 49 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3.log new file mode 100644 index 0000000000000000000000000000000000000000..61265242b3cca01fee57ca54319d8fae40bf9bd3 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,166 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,388 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,512 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,615 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,624 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,643 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47d3a57dbb6cc854182540cf574d9f8548feb31c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..3ac7df679e2a70a235aca4cbb0e8f3293f66d095 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6a03be1e1fb8a7ead4429f0b519db24a8ba45dab0a18a87835ea8affbc2e85 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..06c0e42289cb1546b45a024bb1ad25e8d6f986a2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff60189b8d9d8b107ce989ba373218b4d36d74fd9e5e3b44cd70e59d9d40c97 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..3ac7df679e2a70a235aca4cbb0e8f3293f66d095 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6a03be1e1fb8a7ead4429f0b519db24a8ba45dab0a18a87835ea8affbc2e85 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7afce48c4f5e4c624ab574c1253996563cd4d0e0 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07ebeb5a1a08f14af92b918c8b01d6e0cf7a5872b5fa058ccbf02427ba57d59 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..3ac7df679e2a70a235aca4cbb0e8f3293f66d095 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6a03be1e1fb8a7ead4429f0b519db24a8ba45dab0a18a87835ea8affbc2e85 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c5dafb51f323f67732e8dcff9bffb7332c94aeca --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fd696a948d2e63894da3adeb9c7d269c78c210cd5692bb8bdf01a0e3648a88a +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..8fc917a03b7cb92befe5c94e5bd91a1930cd1c70 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/speech_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_027 155280 +jvs010_VOICEACTRESS100_028 224879 +jvs010_VOICEACTRESS100_029 196320 +jvs010_VOICEACTRESS100_030 126000 +jvs010_VOICEACTRESS100_031 104160 +jvs010_VOICEACTRESS100_032 230400 +jvs010_VOICEACTRESS100_033 75840 +jvs010_VOICEACTRESS100_034 86880 +jvs010_VOICEACTRESS100_035 187200 +jvs010_VOICEACTRESS100_036 176640 +jvs010_VOICEACTRESS100_037 131520 +jvs010_VOICEACTRESS100_038 160800 +jvs010_VOICEACTRESS100_039 154320 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..6c07191b7696321de5e5d653a3fe9f8f70e8890f --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/train/text_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_027 69 +jvs010_VOICEACTRESS100_028 101 +jvs010_VOICEACTRESS100_029 93 +jvs010_VOICEACTRESS100_030 61 +jvs010_VOICEACTRESS100_031 43 +jvs010_VOICEACTRESS100_032 105 +jvs010_VOICEACTRESS100_033 38 +jvs010_VOICEACTRESS100_034 42 +jvs010_VOICEACTRESS100_035 78 +jvs010_VOICEACTRESS100_036 77 +jvs010_VOICEACTRESS100_037 55 +jvs010_VOICEACTRESS100_038 74 +jvs010_VOICEACTRESS100_039 63 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..6ae4cd9041861037f6f56a77ec0362fa94119b62 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605694c0deed61a26e3bf279ad42f3059981b0ad031a9fc6bccfc82f72341bef +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4472a7ec8d8070ec48b940acbe3d22dbf7199a84 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dce00b7fbc18a4daf78f2efd1397b5ab0df26f80b75b5abe07f4c9f646f5149 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..6ae4cd9041861037f6f56a77ec0362fa94119b62 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605694c0deed61a26e3bf279ad42f3059981b0ad031a9fc6bccfc82f72341bef +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f4632cf2d38819a4c5e244ee35a0b34c1dbf115a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45d9f43e18879182f848ab31ac8ebf4b6f40e9b2644e10e96104671fffc37b2e +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..6ae4cd9041861037f6f56a77ec0362fa94119b62 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605694c0deed61a26e3bf279ad42f3059981b0ad031a9fc6bccfc82f72341bef +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..6458436c93c3ef8e986672557f02d7a23df4c68c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4915b8359ebb342c1a79ee20f1bfdb874c868f927dc10b026cef206d58ed5703 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..a611a381ca6dc01fca15c869dca2fa77dfdf0587 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1009 73920 +jvs010_BASIC5000_1087 182160 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..5a4acc836aa4a1bd152c2aa16479fdcd07f00d3c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.3/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1009 38 +jvs010_BASIC5000_1087 96 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4.log new file mode 100644 index 0000000000000000000000000000000000000000..606c0b9f63d3e144c43575f56b984f38e524c15b --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,214 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,437 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,561 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,634 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,644 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,645 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,663 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc5fea9a4535297c86d64d961a25454a589e3038 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..37ffbecfbba941c11dd728da4bfab9a5d33a3501 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0baa2e4acb2b91f7df79a1bf183612cf9397922688dddf2dd6383018253285 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c58aab04268fde852263d14e52b659063768a09f --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd9b817b40bf76f00d879af1d8619c6058039e3c4d38cc203ff66f8d3eddf2cb +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..37ffbecfbba941c11dd728da4bfab9a5d33a3501 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0baa2e4acb2b91f7df79a1bf183612cf9397922688dddf2dd6383018253285 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e52fdf7c88b751cbbc825cab0db126f417c28498 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be82cc7cb74f82c01c6fbd30a1108bfcc22643236d3be0d79099fa0d2166ff4d +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..37ffbecfbba941c11dd728da4bfab9a5d33a3501 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0baa2e4acb2b91f7df79a1bf183612cf9397922688dddf2dd6383018253285 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b037c62e10eb6d13ac9c329e85cc82ea54c64bbe --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805450587f459fb7e121dee381c85602f41e20ed9dd7f222f76e8b1a14ae76ae +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4e6ee4e8c93b145769da8f84ba6aa9c450d7df93 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/speech_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_040 100560 +jvs010_VOICEACTRESS100_041 137040 +jvs010_VOICEACTRESS100_042 102000 +jvs010_VOICEACTRESS100_043 168480 +jvs010_VOICEACTRESS100_044 228240 +jvs010_VOICEACTRESS100_045 205679 +jvs010_VOICEACTRESS100_046 133200 +jvs010_VOICEACTRESS100_047 190080 +jvs010_VOICEACTRESS100_048 194160 +jvs010_VOICEACTRESS100_049 138000 +jvs010_VOICEACTRESS100_050 182160 +jvs010_VOICEACTRESS100_051 113520 +jvs010_VOICEACTRESS100_052 201840 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..41f8983f1452e01eb1c67760846ae554906db121 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/train/text_shape @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_040 42 +jvs010_VOICEACTRESS100_041 69 +jvs010_VOICEACTRESS100_042 43 +jvs010_VOICEACTRESS100_043 75 +jvs010_VOICEACTRESS100_044 91 +jvs010_VOICEACTRESS100_045 84 +jvs010_VOICEACTRESS100_046 52 +jvs010_VOICEACTRESS100_047 81 +jvs010_VOICEACTRESS100_048 87 +jvs010_VOICEACTRESS100_049 64 +jvs010_VOICEACTRESS100_050 88 +jvs010_VOICEACTRESS100_051 53 +jvs010_VOICEACTRESS100_052 85 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8b1b4cefbe4e70a9f59a1622733a14da74c66cbc --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8778670ab81c62998cbd1900a6ea4215d869774a5d8dabf1455852392ea18a0b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..31bfcaabb0c5b727797bde9039f3460677504834 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6d35d44b17bc53214b9537753df058870f72b782237b8dcf5838ec9fbc5fff +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8b1b4cefbe4e70a9f59a1622733a14da74c66cbc --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8778670ab81c62998cbd1900a6ea4215d869774a5d8dabf1455852392ea18a0b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b7aaca4bd6b019d2b221d955ac1c1f295fbb56df --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d25276ce9fe2fdc82d4d140748fbd02c448dbd03003bf2f08d759ce658e638f +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8b1b4cefbe4e70a9f59a1622733a14da74c66cbc --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8778670ab81c62998cbd1900a6ea4215d869774a5d8dabf1455852392ea18a0b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4a3e4b62b239c812834c667f427d7dae353c7ce1 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d129fef04ba3617a016e909803460c85924dc7a7c90225d025225e85b059fd +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..6b8f197375ed24fc86eec462c5853af865192f94 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1122 108000 +jvs010_BASIC5000_1274 67680 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..be91075b917fa8ff80996415c1d1fa9081198834 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.4/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1122 54 +jvs010_BASIC5000_1274 35 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5.log new file mode 100644 index 0000000000000000000000000000000000000000..5408870bccc12124ba6bd1d8a74e5c0e7c3affa1 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,507 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,547 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,558 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,577 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e468cdd39e39b60cdea92789d2366a4e3ec7b4e6 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e6f410dc379f4ba0cafc4afb9c8c20c45a82e00a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de397d7857f6f0f0c36a98e0f15f314eece7bf81d5c8e5df7812c3abc31645b3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a7cc907d5ac772b6e0c80ec6145836ea24f2b6db --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae21518e7ca599bb7e9c7c93330d64963490e6a941a92e63a04e075c0fb61fc6 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e6f410dc379f4ba0cafc4afb9c8c20c45a82e00a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de397d7857f6f0f0c36a98e0f15f314eece7bf81d5c8e5df7812c3abc31645b3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b6ac02260f56b31fdb083ebd97cc26c53c3d97b8 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ae2da254f02b937942311d6e659339ec78e7c3f32b23844757299eda6b71c1 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e6f410dc379f4ba0cafc4afb9c8c20c45a82e00a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de397d7857f6f0f0c36a98e0f15f314eece7bf81d5c8e5df7812c3abc31645b3 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e42fc28f9d947844131646e255d9efde476f3bd0 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1ac772b244e4b8068119740267a6c4f503f86fe966fc31b693b20eb4a16bba0 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..e4ba5cf595d8899d372b1c04098d9a6b4f459ed5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/speech_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_053 132000 +jvs010_VOICEACTRESS100_054 294960 +jvs010_VOICEACTRESS100_055 75360 +jvs010_VOICEACTRESS100_056 188880 +jvs010_VOICEACTRESS100_057 113520 +jvs010_VOICEACTRESS100_058 132720 +jvs010_VOICEACTRESS100_059 145680 +jvs010_VOICEACTRESS100_060 146881 +jvs010_VOICEACTRESS100_061 165600 +jvs010_VOICEACTRESS100_062 160080 +jvs010_VOICEACTRESS100_063 93600 +jvs010_VOICEACTRESS100_064 119520 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..2f139328f11c27773ce06544bb15374ba591ccf7 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/train/text_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_053 62 +jvs010_VOICEACTRESS100_054 125 +jvs010_VOICEACTRESS100_055 40 +jvs010_VOICEACTRESS100_056 80 +jvs010_VOICEACTRESS100_057 60 +jvs010_VOICEACTRESS100_058 66 +jvs010_VOICEACTRESS100_059 71 +jvs010_VOICEACTRESS100_060 69 +jvs010_VOICEACTRESS100_061 83 +jvs010_VOICEACTRESS100_062 76 +jvs010_VOICEACTRESS100_063 47 +jvs010_VOICEACTRESS100_064 60 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..500b7ee0a474d82a519ac4f7b29151759a97db78 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d84a501cd8efe7c21dd5adac812bee102640ca2c7f5fce416ba60a03b41b7b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..0a98a7c32509be266f3bfb3fa7abf0fb642dda81 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2467ffa78163846667a201fcf18be3a80463edc02336b29082083bb25354b188 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..500b7ee0a474d82a519ac4f7b29151759a97db78 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d84a501cd8efe7c21dd5adac812bee102640ca2c7f5fce416ba60a03b41b7b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..0b39234c4c8dcbb7070406f19ad4c7eeec3e08ee --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69b8158170883b20faf589a5ff3fb13dffea911b8b176d3f7b8317316713e181 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..500b7ee0a474d82a519ac4f7b29151759a97db78 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d84a501cd8efe7c21dd5adac812bee102640ca2c7f5fce416ba60a03b41b7b +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..046b05bf30b33ffcd133fb9656a50799b1ad3c8b --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b10cb69b8861e42094fd9631d2ff4aa8be7b54dd3020d20fbdeb4a3f80e64b5d +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..99744ecf0bed163ef0f86c278f14f7977876543f --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1328 94319 +jvs010_BASIC5000_1355 102720 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..bc40581db043a7976468248aa88bdca24a5be388 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.5/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1328 49 +jvs010_BASIC5000_1355 45 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6.log new file mode 100644 index 0000000000000000000000000000000000000000..2fe191208b5e5981e871d300f780766bf05ab9ec --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,383 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,505 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,676 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,687 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,706 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41b1631e65632d7ed5312849d3d39b0fa6dd9f0b --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f3b3648df338124540c28d0cae1f61a4159492fe --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03675b09633548f8e1a52e8d396a57a070fa4dbff20229814ea347e531fa44e +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..cddbeada4ba817a4c5371a253ee43396ba3f9e62 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8212de35300e91d2272b673f50d4fe8ee5e94d66b7d3cd85a986b23c0f536ae +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f3b3648df338124540c28d0cae1f61a4159492fe --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03675b09633548f8e1a52e8d396a57a070fa4dbff20229814ea347e531fa44e +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bd4d07fb54db2fe6da8b73259542dfced8ceed9c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a41768ca4d64367a01487a289a8a715bf4e7fccfe7363d028986cbdee54fbb +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f3b3648df338124540c28d0cae1f61a4159492fe --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03675b09633548f8e1a52e8d396a57a070fa4dbff20229814ea347e531fa44e +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..9d80fbbb8e30c43fa7aa810c5a65a4fb935cc3df --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d0f431312fdf1e209f94866c49d33ce433a40e712b3ea2e07a9b647f1fd1fa +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4f0ba6294d9c4c9a600640700a4b4b7d6b0f770e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/speech_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_065 193200 +jvs010_VOICEACTRESS100_066 171360 +jvs010_VOICEACTRESS100_067 157440 +jvs010_VOICEACTRESS100_068 135840 +jvs010_VOICEACTRESS100_069 218400 +jvs010_VOICEACTRESS100_070 163200 +jvs010_VOICEACTRESS100_071 147120 +jvs010_VOICEACTRESS100_072 156960 +jvs010_VOICEACTRESS100_073 153600 +jvs010_VOICEACTRESS100_074 120960 +jvs010_VOICEACTRESS100_075 155280 +jvs010_VOICEACTRESS100_076 242640 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..b50c1fd8d09c230ad71fd0b516055b3674f492e5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/train/text_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_065 101 +jvs010_VOICEACTRESS100_066 77 +jvs010_VOICEACTRESS100_067 74 +jvs010_VOICEACTRESS100_068 63 +jvs010_VOICEACTRESS100_069 103 +jvs010_VOICEACTRESS100_070 81 +jvs010_VOICEACTRESS100_071 68 +jvs010_VOICEACTRESS100_072 73 +jvs010_VOICEACTRESS100_073 67 +jvs010_VOICEACTRESS100_074 54 +jvs010_VOICEACTRESS100_075 70 +jvs010_VOICEACTRESS100_076 89 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deb9caeb88c470b1be393065087a06089e543349 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af3624d2a1ac0ffe447035740d957dec92d99ce11d994941deeda1657b6de132 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a975e208f771211bafe93b1546b85d9ca1563185 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a58c75a8df667d987616f3c97ee9bccde96af15cca23e595fe613f28fdb640 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deb9caeb88c470b1be393065087a06089e543349 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af3624d2a1ac0ffe447035740d957dec92d99ce11d994941deeda1657b6de132 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..9df1af04a7b2df85470b75b766b6ab349fec867f --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28986a4fe01e77213e716478d3769d6f9e1b5f45a2aab14350e3e37b08682932 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..deb9caeb88c470b1be393065087a06089e543349 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af3624d2a1ac0ffe447035740d957dec92d99ce11d994941deeda1657b6de132 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b21acbcddf20975f7fa88cb88d5855c528b7984b --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b9d4178fa31f125e02eadfac46fd95864c0b649561ac3fe3ae3f781218c668 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3ce94bedb0ad535d9e43671f55f83de86471bfdd --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1474 87119 +jvs010_BASIC5000_1559 177120 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..202215e8a0d20ab16fe8c4849c82cfc1e86a7be7 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.6/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1474 47 +jvs010_BASIC5000_1559 82 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7.log new file mode 100644 index 0000000000000000000000000000000000000000..b5dd7afa47c8d5a8aba075677386cfadbc58bcbe --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,508 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,651 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,661 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,680 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fa0b190584e612bccd63781d33f5118f6eafc9d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c1f694d377d36ed3dab0ca1cbf3479fc789adef2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4670fc9ad4a6a7c89aef3fb67fe10908b3c434f9e4c4fa3ec51718cf6cae18 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..310562bd326bb3315145ee24664804bda86430a9 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1bb6074dde158b2372b46aa0b52e9e5adac88d47d9c36e8f62a0a2a751be137 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c1f694d377d36ed3dab0ca1cbf3479fc789adef2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4670fc9ad4a6a7c89aef3fb67fe10908b3c434f9e4c4fa3ec51718cf6cae18 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c47e23f500a51ad5517acb618a894abe9371c673 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf32fc17384c6b34dfd2ba0d4805d49d2cc48c8419caf126f9131fb0b55b198 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c1f694d377d36ed3dab0ca1cbf3479fc789adef2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4670fc9ad4a6a7c89aef3fb67fe10908b3c434f9e4c4fa3ec51718cf6cae18 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..2b9f644bef8f79c103aa0bba0e249fa96acc37fd --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9da6886ddcd52dbb7a3049d8ae32381177326dc56f138d5635e44bf55f786fb6 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..f7bc72bb05e7987e6244ee04431cfe08f8abc052 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/speech_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_077 216719 +jvs010_VOICEACTRESS100_078 238800 +jvs010_VOICEACTRESS100_079 196320 +jvs010_VOICEACTRESS100_080 158640 +jvs010_VOICEACTRESS100_081 207360 +jvs010_VOICEACTRESS100_082 148560 +jvs010_VOICEACTRESS100_083 145200 +jvs010_VOICEACTRESS100_084 181440 +jvs010_VOICEACTRESS100_085 258480 +jvs010_VOICEACTRESS100_086 178320 +jvs010_VOICEACTRESS100_087 231840 +jvs010_VOICEACTRESS100_088 156960 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..b1090321b4c48540a2444ae1c5589904ccfa46d2 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/train/text_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_077 91 +jvs010_VOICEACTRESS100_078 101 +jvs010_VOICEACTRESS100_079 88 +jvs010_VOICEACTRESS100_080 77 +jvs010_VOICEACTRESS100_081 95 +jvs010_VOICEACTRESS100_082 75 +jvs010_VOICEACTRESS100_083 61 +jvs010_VOICEACTRESS100_084 86 +jvs010_VOICEACTRESS100_085 111 +jvs010_VOICEACTRESS100_086 70 +jvs010_VOICEACTRESS100_087 101 +jvs010_VOICEACTRESS100_088 65 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ccc6e48f62d9a37966047a78791fdefb800a634d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63acd6bed570179d78045fe800925932df2acbe0b07af11a711a94ca31472c3c +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..daa349df57611aef91ab0c0b1a9b12ee0370129a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e773ed02ada40cf2f29eabd2dffa2b532681e0674f97baed5173537099e733ce +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ccc6e48f62d9a37966047a78791fdefb800a634d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63acd6bed570179d78045fe800925932df2acbe0b07af11a711a94ca31472c3c +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..dbf2708ee784ce5abab743c9f7ae4018f8530a11 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b05bf721aa43e169d0738332a7af0e8baa6538a57286464d5f57d2a81b43cc9 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ccc6e48f62d9a37966047a78791fdefb800a634d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63acd6bed570179d78045fe800925932df2acbe0b07af11a711a94ca31472c3c +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7c157ef6be8d9b0c707fea6540a77e383ab96608 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa312ba2ce9f1ec65c7166b0dc997ff7d24ac348ba908847d71b4490c624ef9 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..24527f7c7f035326c748854f469861a37121cb0c --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/speech_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1645 131280 +jvs010_BASIC5000_1849 205440 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..c079cafe5d550c677d3e00c288de5eb6c226cc84 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.7/valid/text_shape @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1645 64 +jvs010_BASIC5000_1849 81 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8.log b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8.log new file mode 100644 index 0000000000000000000000000000000000000000..7eea6855f9865f576166b064fbcd4afab61a4847 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8.log @@ -0,0 +1,874 @@ +# python3 -m espnet2.bin.gan_tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +# Started at Tue Mar 4 21:23:26 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 +[92b100c97f43] 2025-03-04 21:23:29,162 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43] 2025-03-04 21:23:29,385 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:29,508 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43] 2025-03-04 21:23:31,596 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (pitch_extract): Dio() + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1162) INFO: Scheduler: +[92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43] 2025-03-04 21:23:31,606 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43] 2025-03-04 21:23:31,607 (abs_task:1171) INFO: Saving the configuration in exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml +[92b100c97f43] 2025-03-04 21:23:31,625 (abs_task:1182) INFO: Namespace(accum_grad=1, allow_variable_data_keys=False, batch_bins=6000000, batch_size=20, batch_type='numel', best_model_criterion=[['valid', 'text2mel_loss', 'min'], ['train', 'text2mel_loss', 'min'], ['train', 'total_count', 'max']], bpemodel=None, chunk_length=500, chunk_shift_ratio=0.5, cleaner='jaconv', collect_stats=True, config='conf/tuning/train_jets.yaml', cudnn_benchmark=False, cudnn_deterministic=False, cudnn_enabled=True, detect_anomaly=False, dist_backend='nccl', dist_init_method='env://', dist_launcher=None, dist_master_addr=None, dist_master_port=None, dist_rank=None, dist_world_size=None, distributed=False, dry_run=False, early_stopping_criterion=('valid', 'loss', 'min'), energy_extract='energy', energy_extract_conf={'reduction_factor': 1, 'use_token_averaged_energy': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'win_length': 1200}, energy_normalize=None, energy_normalize_conf={}, feats_extract='fbank', feats_extract_conf={'n_fft': 2048, 'hop_length': 300, 'win_length': 1200, 'fs': 24000, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, fold_length=[], freeze_param=[], g2p='pyopenjtalk', generator_first=True, grad_clip=-1, grad_clip_type=2.0, grad_noise=False, ignore_init_mismatch=False, init_param=[], iterator_type='sequence', keep_nbest_models=-1, local_rank=None, log_interval=50, log_level='INFO', max_cache_fd=32, max_cache_size=0.0, max_epoch=130, model_conf={}, multiple_iterator=False, multiprocessing_distributed=False, nbest_averaging_interval=0, ngpu=0, no_forward_run=False, non_linguistic_symbols=None, normalize=None, normalize_conf={}, num_att_plot=3, num_cache_chunks=1024, num_iters_per_epoch=1000, num_workers=16, odim=None, optim='adamw', optim2='adamw', optim2_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, optim_conf={'lr': 0.0002, 'betas': [0.8, 0.99], 'eps': 1e-09, 'weight_decay': 0.0}, output_dir='exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8', patience=None, pitch_extract='dio', pitch_extract_conf={'reduction_factor': 1, 'use_token_averaged_f0': False, 'fs': 24000, 'n_fft': 2048, 'hop_length': 300, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, pretrain_path=None, print_config=False, required=['output_dir', 'token_list'], resume=False, scheduler='exponentiallr', scheduler2='exponentiallr', scheduler2_conf={'gamma': 0.999875}, scheduler_conf={'gamma': 0.999875}, seed=777, sharded_ddp=False, sort_batch='descending', sort_in_batch='descending', token_list=['', '', 'o', 'a', 'u', 'i', 'e', 'k', 'r', 't', 'n', 'pau', 'N', 's', 'sh', 'd', 'm', 'g', 'w', 'b', 'cl', 'I', 'j', 'ch', 'y', 'U', 'h', 'p', 'ts', 'f', 'z', 'ky', 'ny', 'gy', 'ry', 'hy', 'my', 'by', 'py', 'v', ''], token_type='phn', train_data_path_and_name_and_type=[('dump/raw/jvs010_tr_no_dev/text', 'text', 'text'), ('dump/raw/jvs010_tr_no_dev/wav.scp', 'speech', 'sound')], train_dtype='float32', train_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp'], tts='jets', tts_conf={'generator_type': 'jets_generator', 'generator_params': {'adim': 256, 'aheads': 2, 'elayers': 4, 'eunits': 1024, 'dlayers': 4, 'dunits': 1024, 'positionwise_layer_type': 'conv1d', 'positionwise_conv_kernel_size': 3, 'duration_predictor_layers': 2, 'duration_predictor_chans': 256, 'duration_predictor_kernel_size': 3, 'use_masking': True, 'encoder_normalize_before': True, 'decoder_normalize_before': True, 'encoder_type': 'transformer', 'decoder_type': 'transformer', 'conformer_rel_pos_type': 'latest', 'conformer_pos_enc_layer_type': 'rel_pos', 'conformer_self_attn_layer_type': 'rel_selfattn', 'conformer_activation_type': 'swish', 'use_macaron_style_in_conformer': True, 'use_cnn_in_conformer': True, 'conformer_enc_kernel_size': 7, 'conformer_dec_kernel_size': 31, 'init_type': 'xavier_uniform', 'transformer_enc_dropout_rate': 0.2, 'transformer_enc_positional_dropout_rate': 0.2, 'transformer_enc_attn_dropout_rate': 0.2, 'transformer_dec_dropout_rate': 0.2, 'transformer_dec_positional_dropout_rate': 0.2, 'transformer_dec_attn_dropout_rate': 0.2, 'pitch_predictor_layers': 5, 'pitch_predictor_chans': 256, 'pitch_predictor_kernel_size': 5, 'pitch_predictor_dropout': 0.5, 'pitch_embed_kernel_size': 1, 'pitch_embed_dropout': 0.0, 'stop_gradient_from_pitch_predictor': True, 'energy_predictor_layers': 2, 'energy_predictor_chans': 256, 'energy_predictor_kernel_size': 3, 'energy_predictor_dropout': 0.5, 'energy_embed_kernel_size': 1, 'energy_embed_dropout': 0.0, 'stop_gradient_from_energy_predictor': False, 'generator_out_channels': 1, 'generator_channels': 512, 'generator_global_channels': -1, 'generator_kernel_size': 7, 'generator_upsample_scales': [8, 8, 2, 2], 'generator_upsample_kernel_sizes': [16, 16, 4, 4], 'generator_resblock_kernel_sizes': [3, 7, 11], 'generator_resblock_dilations': [[1, 3, 5], [1, 3, 5], [1, 3, 5]], 'generator_use_additional_convs': True, 'generator_bias': True, 'generator_nonlinear_activation': 'LeakyReLU', 'generator_nonlinear_activation_params': {'negative_slope': 0.1}, 'generator_use_weight_norm': True, 'segment_size': 64, 'idim': 41, 'odim': 80}, 'discriminator_type': 'hifigan_multi_scale_multi_period_discriminator', 'discriminator_params': {'scales': 1, 'scale_downsample_pooling': 'AvgPool1d', 'scale_downsample_pooling_params': {'kernel_size': 4, 'stride': 2, 'padding': 2}, 'scale_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [15, 41, 5, 3], 'channels': 128, 'max_downsample_channels': 1024, 'max_groups': 16, 'bias': True, 'downsample_scales': [2, 2, 4, 4, 1], 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}, 'follow_official_norm': False, 'periods': [2, 3, 5, 7, 11], 'period_discriminator_params': {'in_channels': 1, 'out_channels': 1, 'kernel_sizes': [5, 3], 'channels': 32, 'downsample_scales': [3, 3, 3, 3, 1], 'max_downsample_channels': 1024, 'bias': True, 'nonlinear_activation': 'LeakyReLU', 'nonlinear_activation_params': {'negative_slope': 0.1}, 'use_weight_norm': True, 'use_spectral_norm': False}}, 'generator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'discriminator_adv_loss_params': {'average_by_discriminators': False, 'loss_type': 'mse'}, 'feat_match_loss_params': {'average_by_discriminators': False, 'average_by_layers': False, 'include_final_outputs': True}, 'mel_loss_params': {'fs': 24000, 'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'window': 'hann', 'n_mels': 80, 'fmin': 0, 'fmax': None, 'log_base': None}, 'lambda_adv': 1.0, 'lambda_mel': 45.0, 'lambda_feat_match': 2.0, 'lambda_var': 1.0, 'lambda_align': 2.0, 'sampling_rate': 24000, 'cache_generator_outputs': True}, unused_parameters=True, use_amp=False, use_matplotlib=True, use_preprocessor=True, use_tensorboard=True, use_wandb=False, val_scheduler_criterion=('valid', 'loss'), valid_batch_bins=None, valid_batch_size=None, valid_batch_type=None, valid_data_path_and_name_and_type=[('dump/raw/jvs010_dev/text', 'text', 'text'), ('dump/raw/jvs010_dev/wav.scp', 'speech', 'sound')], valid_max_cache_size=None, valid_shape_file=['exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp'], version='202204', wandb_entity=None, wandb_id=None, wandb_model_log_interval=-1, wandb_name=None, wandb_project=None, write_collected_feats=False) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +# Accounting: time=11 threads=1 +# Ended (code 0) at Tue Mar 4 21:23:37 JST 2025, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2430fdade0eecaa2378e73e580cdaf150f471ff6 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/config.yaml @@ -0,0 +1,386 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8 +ngpu: 0 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: true +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202204' +distributed: false diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f32acba55db9e15a1e48c3b5be00b3153b8925b5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da206629d56769eda2a43fb4f1d571544abe0f4363a6af12ad45c5a4fa596377 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c4ddf245779a2aeeffb9f8150642e6b2dd591e60 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27a07f4ee6c4d0bad1824745936515bae784fdb2496afe9cf336eafd49956416 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f32acba55db9e15a1e48c3b5be00b3153b8925b5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da206629d56769eda2a43fb4f1d571544abe0f4363a6af12ad45c5a4fa596377 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f04aeea8a62f55a1f349cb72359c00f14aa86634 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e5581691218c8e4d347ca36281958a34f1c5bb987891b1dd4f0ec98adbf7c7 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f32acba55db9e15a1e48c3b5be00b3153b8925b5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da206629d56769eda2a43fb4f1d571544abe0f4363a6af12ad45c5a4fa596377 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1b03be30b061b04baab04a1f596010f771e0a5c9 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d0efa78fb0cf5f7921befa1ae4dd2b37cee1fd8c19948a594f7c7e95b2ac3b +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..bf5dbbf959f2da57cc11d4a235e5ab55a3372181 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/speech_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_089 164640 +jvs010_VOICEACTRESS100_090 155280 +jvs010_VOICEACTRESS100_091 88559 +jvs010_VOICEACTRESS100_092 170400 +jvs010_VOICEACTRESS100_093 235680 +jvs010_VOICEACTRESS100_094 348000 +jvs010_VOICEACTRESS100_095 180240 +jvs010_VOICEACTRESS100_096 216000 +jvs010_VOICEACTRESS100_097 223920 +jvs010_VOICEACTRESS100_098 138720 +jvs010_VOICEACTRESS100_099 110640 +jvs010_VOICEACTRESS100_100 228000 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..10af53c6549a69ab447b432afcc85f7508edd551 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/train/text_shape @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_089 73 +jvs010_VOICEACTRESS100_090 62 +jvs010_VOICEACTRESS100_091 38 +jvs010_VOICEACTRESS100_092 79 +jvs010_VOICEACTRESS100_093 106 +jvs010_VOICEACTRESS100_094 149 +jvs010_VOICEACTRESS100_095 87 +jvs010_VOICEACTRESS100_096 94 +jvs010_VOICEACTRESS100_097 102 +jvs010_VOICEACTRESS100_098 68 +jvs010_VOICEACTRESS100_099 51 +jvs010_VOICEACTRESS100_100 89 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/batch_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bbd083bc6eb3253b9639d393dd62090fb839697e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8198e47ae1b0a78a6b6d4aca7df0acb304da37918e1b4209bb78e9ab7871b6a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..06ad4dab93f64a303b6359358595fb2b4086b0c6 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be763ff2cc613966b77cbd0f3cbb0acf8afc4c71b5779f95cf156744a47f5a0d +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bbd083bc6eb3253b9639d393dd62090fb839697e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8198e47ae1b0a78a6b6d4aca7df0acb304da37918e1b4209bb78e9ab7871b6a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a9e386c55988ff64976c1d3db0ee19e278d1ecf8 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109b65c5c157892e424f5d4c803fddc402e8f4fbf7251ee4871f98334c3db7e9 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bbd083bc6eb3253b9639d393dd62090fb839697e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8198e47ae1b0a78a6b6d4aca7df0acb304da37918e1b4209bb78e9ab7871b6a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a99669957a630f4f086930ce2d7d54d4ededf256 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3dd256463e1ce3b3e26ce2a9a5a837cf997876e2e45d24499287f1f889a74ae +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..0542d7531ed63bd0eee86ca98621bd23fa2248ab --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/speech_shape @@ -0,0 +1 @@ +jvs010_BASIC5000_1889 61680 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/stats_keys b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..80f3d5a4b783a4f6822b81505030a47d89bd5685 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/stats_keys @@ -0,0 +1,6 @@ +feats +feats_lengths +pitch +pitch_lengths +energy +energy_lengths diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..9fe7e0db1ab2990cbc299615b5b6af7f2a1c7b54 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/stats.8/valid/text_shape @@ -0,0 +1 @@ +jvs010_BASIC5000_1889 39 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp new file mode 100644 index 0000000000000000000000000000000000000000..66c1fa47d4bddfa5943f986701440143bb5461b0 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.1.scp @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_001 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_001.wav +jvs010_VOICEACTRESS100_002 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_002.wav +jvs010_VOICEACTRESS100_003 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_003.wav +jvs010_VOICEACTRESS100_004 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_004.wav +jvs010_VOICEACTRESS100_005 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_005.wav +jvs010_VOICEACTRESS100_006 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_006.wav +jvs010_VOICEACTRESS100_007 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_007.wav +jvs010_VOICEACTRESS100_008 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_008.wav +jvs010_VOICEACTRESS100_009 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_009.wav +jvs010_VOICEACTRESS100_010 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_010.wav +jvs010_VOICEACTRESS100_011 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_011.wav +jvs010_VOICEACTRESS100_012 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_012.wav +jvs010_VOICEACTRESS100_013 dump/raw/org/jvs010_tr_no_dev/data/format.1/jvs010_VOICEACTRESS100_013.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp new file mode 100644 index 0000000000000000000000000000000000000000..e7365610c8ae73e3e14c24ce36053eacb283ec36 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.2.scp @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_014 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_014.wav +jvs010_VOICEACTRESS100_015 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_015.wav +jvs010_VOICEACTRESS100_016 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_016.wav +jvs010_VOICEACTRESS100_017 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_017.wav +jvs010_VOICEACTRESS100_018 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_018.wav +jvs010_VOICEACTRESS100_019 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_019.wav +jvs010_VOICEACTRESS100_020 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_020.wav +jvs010_VOICEACTRESS100_021 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_021.wav +jvs010_VOICEACTRESS100_022 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_022.wav +jvs010_VOICEACTRESS100_023 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_023.wav +jvs010_VOICEACTRESS100_024 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_024.wav +jvs010_VOICEACTRESS100_025 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_025.wav +jvs010_VOICEACTRESS100_026 dump/raw/org/jvs010_tr_no_dev/data/format.2/jvs010_VOICEACTRESS100_026.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp new file mode 100644 index 0000000000000000000000000000000000000000..37c24c83eb67cf839f1062b6a85b1e7072591681 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.3.scp @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_027 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_027.wav +jvs010_VOICEACTRESS100_028 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_028.wav +jvs010_VOICEACTRESS100_029 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_029.wav +jvs010_VOICEACTRESS100_030 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_030.wav +jvs010_VOICEACTRESS100_031 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_031.wav +jvs010_VOICEACTRESS100_032 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_032.wav +jvs010_VOICEACTRESS100_033 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_033.wav +jvs010_VOICEACTRESS100_034 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_034.wav +jvs010_VOICEACTRESS100_035 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_035.wav +jvs010_VOICEACTRESS100_036 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_036.wav +jvs010_VOICEACTRESS100_037 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_037.wav +jvs010_VOICEACTRESS100_038 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_038.wav +jvs010_VOICEACTRESS100_039 dump/raw/org/jvs010_tr_no_dev/data/format.3/jvs010_VOICEACTRESS100_039.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp new file mode 100644 index 0000000000000000000000000000000000000000..3bd64592ea88c5f1bbd861e74730e090820bc9cb --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.4.scp @@ -0,0 +1,13 @@ +jvs010_VOICEACTRESS100_040 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_040.wav +jvs010_VOICEACTRESS100_041 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_041.wav +jvs010_VOICEACTRESS100_042 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_042.wav +jvs010_VOICEACTRESS100_043 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_043.wav +jvs010_VOICEACTRESS100_044 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_044.wav +jvs010_VOICEACTRESS100_045 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_045.wav +jvs010_VOICEACTRESS100_046 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_046.wav +jvs010_VOICEACTRESS100_047 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_047.wav +jvs010_VOICEACTRESS100_048 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_048.wav +jvs010_VOICEACTRESS100_049 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_049.wav +jvs010_VOICEACTRESS100_050 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_050.wav +jvs010_VOICEACTRESS100_051 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_051.wav +jvs010_VOICEACTRESS100_052 dump/raw/org/jvs010_tr_no_dev/data/format.4/jvs010_VOICEACTRESS100_052.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp new file mode 100644 index 0000000000000000000000000000000000000000..cec0fcc1eaedcc3a691a65d0ad6839e279a0eb90 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.5.scp @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_053 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_053.wav +jvs010_VOICEACTRESS100_054 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_054.wav +jvs010_VOICEACTRESS100_055 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_055.wav +jvs010_VOICEACTRESS100_056 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_056.wav +jvs010_VOICEACTRESS100_057 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_057.wav +jvs010_VOICEACTRESS100_058 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_058.wav +jvs010_VOICEACTRESS100_059 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_059.wav +jvs010_VOICEACTRESS100_060 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_060.wav +jvs010_VOICEACTRESS100_061 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_061.wav +jvs010_VOICEACTRESS100_062 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_062.wav +jvs010_VOICEACTRESS100_063 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_063.wav +jvs010_VOICEACTRESS100_064 dump/raw/org/jvs010_tr_no_dev/data/format.5/jvs010_VOICEACTRESS100_064.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp new file mode 100644 index 0000000000000000000000000000000000000000..e5afc5942b777054e4dd16a862f5828510d87726 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.6.scp @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_065 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_065.wav +jvs010_VOICEACTRESS100_066 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_066.wav +jvs010_VOICEACTRESS100_067 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_067.wav +jvs010_VOICEACTRESS100_068 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_068.wav +jvs010_VOICEACTRESS100_069 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_069.wav +jvs010_VOICEACTRESS100_070 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_070.wav +jvs010_VOICEACTRESS100_071 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_071.wav +jvs010_VOICEACTRESS100_072 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_072.wav +jvs010_VOICEACTRESS100_073 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_073.wav +jvs010_VOICEACTRESS100_074 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_074.wav +jvs010_VOICEACTRESS100_075 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_075.wav +jvs010_VOICEACTRESS100_076 dump/raw/org/jvs010_tr_no_dev/data/format.6/jvs010_VOICEACTRESS100_076.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp new file mode 100644 index 0000000000000000000000000000000000000000..7c6726a2e40a5b6d54b6ef58f0de28599360afcf --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.7.scp @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_077 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_077.wav +jvs010_VOICEACTRESS100_078 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_078.wav +jvs010_VOICEACTRESS100_079 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_079.wav +jvs010_VOICEACTRESS100_080 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_080.wav +jvs010_VOICEACTRESS100_081 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_081.wav +jvs010_VOICEACTRESS100_082 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_082.wav +jvs010_VOICEACTRESS100_083 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_083.wav +jvs010_VOICEACTRESS100_084 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_084.wav +jvs010_VOICEACTRESS100_085 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_085.wav +jvs010_VOICEACTRESS100_086 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_086.wav +jvs010_VOICEACTRESS100_087 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_087.wav +jvs010_VOICEACTRESS100_088 dump/raw/org/jvs010_tr_no_dev/data/format.7/jvs010_VOICEACTRESS100_088.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp new file mode 100644 index 0000000000000000000000000000000000000000..51002cdd1cfb9eccdcfa60e96f1dcfc8ec1b85fc --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/train.8.scp @@ -0,0 +1,12 @@ +jvs010_VOICEACTRESS100_089 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_089.wav +jvs010_VOICEACTRESS100_090 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_090.wav +jvs010_VOICEACTRESS100_091 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_091.wav +jvs010_VOICEACTRESS100_092 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_092.wav +jvs010_VOICEACTRESS100_093 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_093.wav +jvs010_VOICEACTRESS100_094 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_094.wav +jvs010_VOICEACTRESS100_095 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_095.wav +jvs010_VOICEACTRESS100_096 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_096.wav +jvs010_VOICEACTRESS100_097 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_097.wav +jvs010_VOICEACTRESS100_098 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_098.wav +jvs010_VOICEACTRESS100_099 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_099.wav +jvs010_VOICEACTRESS100_100 dump/raw/org/jvs010_tr_no_dev/data/format.8/jvs010_VOICEACTRESS100_100.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp new file mode 100644 index 0000000000000000000000000000000000000000..003aec17497a9efe6835f5159be3b05c17256666 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.1.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0113 dump/raw/org/jvs010_dev/data/format.1/jvs010_BASIC5000_0113.wav +jvs010_BASIC5000_0261 dump/raw/org/jvs010_dev/data/format.1/jvs010_BASIC5000_0261.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp new file mode 100644 index 0000000000000000000000000000000000000000..322610fccc735bd127e36625a1b2a087d427c073 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.2.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_0351 dump/raw/org/jvs010_dev/data/format.2/jvs010_BASIC5000_0351.wav +jvs010_BASIC5000_0882 dump/raw/org/jvs010_dev/data/format.2/jvs010_BASIC5000_0882.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp new file mode 100644 index 0000000000000000000000000000000000000000..d75867f1b9f71deb4e6a8b31a5b5a169fa6e0fa1 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.3.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1009 dump/raw/org/jvs010_dev/data/format.3/jvs010_BASIC5000_1009.wav +jvs010_BASIC5000_1087 dump/raw/org/jvs010_dev/data/format.3/jvs010_BASIC5000_1087.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp new file mode 100644 index 0000000000000000000000000000000000000000..bbcb1fff855ef886b7c365b2dfe93e9066f2ba1a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.4.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1122 dump/raw/org/jvs010_dev/data/format.4/jvs010_BASIC5000_1122.wav +jvs010_BASIC5000_1274 dump/raw/org/jvs010_dev/data/format.4/jvs010_BASIC5000_1274.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp new file mode 100644 index 0000000000000000000000000000000000000000..317541e85451ee010c138268c1cd7de2c9f25a29 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.5.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1328 dump/raw/org/jvs010_dev/data/format.5/jvs010_BASIC5000_1328.wav +jvs010_BASIC5000_1355 dump/raw/org/jvs010_dev/data/format.5/jvs010_BASIC5000_1355.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp new file mode 100644 index 0000000000000000000000000000000000000000..ee723bd7a57fe28ed30810992a79abaadb4552a9 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.6.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1474 dump/raw/org/jvs010_dev/data/format.6/jvs010_BASIC5000_1474.wav +jvs010_BASIC5000_1559 dump/raw/org/jvs010_dev/data/format.6/jvs010_BASIC5000_1559.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp new file mode 100644 index 0000000000000000000000000000000000000000..12bf8005b1f8b2a4cd46098e61925e84b494240e --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.7.scp @@ -0,0 +1,2 @@ +jvs010_BASIC5000_1645 dump/raw/org/jvs010_dev/data/format.7/jvs010_BASIC5000_1645.wav +jvs010_BASIC5000_1849 dump/raw/org/jvs010_dev/data/format.7/jvs010_BASIC5000_1849.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp new file mode 100644 index 0000000000000000000000000000000000000000..2ab86cf93715316ccdebbf67e610aeca3c8aae54 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/logdir/valid.8.scp @@ -0,0 +1 @@ +jvs010_BASIC5000_1889 dump/raw/org/jvs010_dev/data/format.8/jvs010_BASIC5000_1889.wav diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/run.sh b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..45a006d35444d749c8dd8496a60ccbd3763715f6 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/run.sh @@ -0,0 +1 @@ +./tts.sh --lang jp --local_data_opts '--spk jvs010' --feats_type raw --fs 24000 --n_fft 2048 --n_shift 300 --win_length 1200 --token_type phn --cleaner jaconv --g2p pyopenjtalk --train_config conf/finetune.yaml --inference_config conf/decode.yaml --train_set jvs010_tr_no_dev --valid_set jvs010_dev --test_sets 'jvs010_dev jvs010_eval1' --srctexts data/jvs010_tr_no_dev/text --audio_format wav --train_config conf/tuning/train_jets.yaml --tts_task gan_tts --stage 1 --stop_stage 7 --ngpu 4 --stage 5 "$@"; exit $? diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8a68f08fb4837f6185205e93c3f080b292ac5900 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e05f6213172d6d162590ff91b20476dbd3264aad75104f9e648da5e60eb39a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..3e9a9532376b574fb220cc57035499399d78c480 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25848d66f29635f41196dbc0239c0bbcb0410edf788a3d9455c5958f800c43e9 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8a68f08fb4837f6185205e93c3f080b292ac5900 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e05f6213172d6d162590ff91b20476dbd3264aad75104f9e648da5e60eb39a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7d3a49407168b6a26233c25d2c291067f03be768 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c5e334600e21fcf6e625cc6157e8b4d86c306b0e0f6161140e5bb03619cebff +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8a68f08fb4837f6185205e93c3f080b292ac5900 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e05f6213172d6d162590ff91b20476dbd3264aad75104f9e648da5e60eb39a +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bed42988adbeda9a7235329d8673483fdd754839 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29b5647627098602bc97d5405a13e35d4e8fc54d670b37a2e8c1a199d7a5d07b +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..9aca23e9257e4bb2e44f8e4d9a9c274f5c2afe9d --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape @@ -0,0 +1,100 @@ +jvs010_VOICEACTRESS100_001 178800 +jvs010_VOICEACTRESS100_002 198240 +jvs010_VOICEACTRESS100_003 128640 +jvs010_VOICEACTRESS100_004 132000 +jvs010_VOICEACTRESS100_005 277440 +jvs010_VOICEACTRESS100_006 94560 +jvs010_VOICEACTRESS100_007 182160 +jvs010_VOICEACTRESS100_008 180960 +jvs010_VOICEACTRESS100_009 145920 +jvs010_VOICEACTRESS100_010 105359 +jvs010_VOICEACTRESS100_011 148080 +jvs010_VOICEACTRESS100_012 130320 +jvs010_VOICEACTRESS100_013 117839 +jvs010_VOICEACTRESS100_014 56160 +jvs010_VOICEACTRESS100_015 104640 +jvs010_VOICEACTRESS100_016 122880 +jvs010_VOICEACTRESS100_017 148080 +jvs010_VOICEACTRESS100_018 161041 +jvs010_VOICEACTRESS100_019 185040 +jvs010_VOICEACTRESS100_020 162000 +jvs010_VOICEACTRESS100_021 154320 +jvs010_VOICEACTRESS100_022 314880 +jvs010_VOICEACTRESS100_023 150480 +jvs010_VOICEACTRESS100_024 197040 +jvs010_VOICEACTRESS100_025 98880 +jvs010_VOICEACTRESS100_026 89040 +jvs010_VOICEACTRESS100_027 155280 +jvs010_VOICEACTRESS100_028 224879 +jvs010_VOICEACTRESS100_029 196320 +jvs010_VOICEACTRESS100_030 126000 +jvs010_VOICEACTRESS100_031 104160 +jvs010_VOICEACTRESS100_032 230400 +jvs010_VOICEACTRESS100_033 75840 +jvs010_VOICEACTRESS100_034 86880 +jvs010_VOICEACTRESS100_035 187200 +jvs010_VOICEACTRESS100_036 176640 +jvs010_VOICEACTRESS100_037 131520 +jvs010_VOICEACTRESS100_038 160800 +jvs010_VOICEACTRESS100_039 154320 +jvs010_VOICEACTRESS100_040 100560 +jvs010_VOICEACTRESS100_041 137040 +jvs010_VOICEACTRESS100_042 102000 +jvs010_VOICEACTRESS100_043 168480 +jvs010_VOICEACTRESS100_044 228240 +jvs010_VOICEACTRESS100_045 205679 +jvs010_VOICEACTRESS100_046 133200 +jvs010_VOICEACTRESS100_047 190080 +jvs010_VOICEACTRESS100_048 194160 +jvs010_VOICEACTRESS100_049 138000 +jvs010_VOICEACTRESS100_050 182160 +jvs010_VOICEACTRESS100_051 113520 +jvs010_VOICEACTRESS100_052 201840 +jvs010_VOICEACTRESS100_053 132000 +jvs010_VOICEACTRESS100_054 294960 +jvs010_VOICEACTRESS100_055 75360 +jvs010_VOICEACTRESS100_056 188880 +jvs010_VOICEACTRESS100_057 113520 +jvs010_VOICEACTRESS100_058 132720 +jvs010_VOICEACTRESS100_059 145680 +jvs010_VOICEACTRESS100_060 146881 +jvs010_VOICEACTRESS100_061 165600 +jvs010_VOICEACTRESS100_062 160080 +jvs010_VOICEACTRESS100_063 93600 +jvs010_VOICEACTRESS100_064 119520 +jvs010_VOICEACTRESS100_065 193200 +jvs010_VOICEACTRESS100_066 171360 +jvs010_VOICEACTRESS100_067 157440 +jvs010_VOICEACTRESS100_068 135840 +jvs010_VOICEACTRESS100_069 218400 +jvs010_VOICEACTRESS100_070 163200 +jvs010_VOICEACTRESS100_071 147120 +jvs010_VOICEACTRESS100_072 156960 +jvs010_VOICEACTRESS100_073 153600 +jvs010_VOICEACTRESS100_074 120960 +jvs010_VOICEACTRESS100_075 155280 +jvs010_VOICEACTRESS100_076 242640 +jvs010_VOICEACTRESS100_077 216719 +jvs010_VOICEACTRESS100_078 238800 +jvs010_VOICEACTRESS100_079 196320 +jvs010_VOICEACTRESS100_080 158640 +jvs010_VOICEACTRESS100_081 207360 +jvs010_VOICEACTRESS100_082 148560 +jvs010_VOICEACTRESS100_083 145200 +jvs010_VOICEACTRESS100_084 181440 +jvs010_VOICEACTRESS100_085 258480 +jvs010_VOICEACTRESS100_086 178320 +jvs010_VOICEACTRESS100_087 231840 +jvs010_VOICEACTRESS100_088 156960 +jvs010_VOICEACTRESS100_089 164640 +jvs010_VOICEACTRESS100_090 155280 +jvs010_VOICEACTRESS100_091 88559 +jvs010_VOICEACTRESS100_092 170400 +jvs010_VOICEACTRESS100_093 235680 +jvs010_VOICEACTRESS100_094 348000 +jvs010_VOICEACTRESS100_095 180240 +jvs010_VOICEACTRESS100_096 216000 +jvs010_VOICEACTRESS100_097 223920 +jvs010_VOICEACTRESS100_098 138720 +jvs010_VOICEACTRESS100_099 110640 +jvs010_VOICEACTRESS100_100 228000 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..ef6d7ca5c3c8390684e158784b51afd5d1a8baa5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape @@ -0,0 +1,100 @@ +jvs010_VOICEACTRESS100_001 78 +jvs010_VOICEACTRESS100_002 91 +jvs010_VOICEACTRESS100_003 69 +jvs010_VOICEACTRESS100_004 64 +jvs010_VOICEACTRESS100_005 121 +jvs010_VOICEACTRESS100_006 49 +jvs010_VOICEACTRESS100_007 93 +jvs010_VOICEACTRESS100_008 82 +jvs010_VOICEACTRESS100_009 69 +jvs010_VOICEACTRESS100_010 49 +jvs010_VOICEACTRESS100_011 77 +jvs010_VOICEACTRESS100_012 55 +jvs010_VOICEACTRESS100_013 54 +jvs010_VOICEACTRESS100_014 33 +jvs010_VOICEACTRESS100_015 47 +jvs010_VOICEACTRESS100_016 62 +jvs010_VOICEACTRESS100_017 60 +jvs010_VOICEACTRESS100_018 78 +jvs010_VOICEACTRESS100_019 87 +jvs010_VOICEACTRESS100_020 80 +jvs010_VOICEACTRESS100_021 67 +jvs010_VOICEACTRESS100_022 124 +jvs010_VOICEACTRESS100_023 69 +jvs010_VOICEACTRESS100_024 86 +jvs010_VOICEACTRESS100_025 54 +jvs010_VOICEACTRESS100_026 40 +jvs010_VOICEACTRESS100_027 69 +jvs010_VOICEACTRESS100_028 101 +jvs010_VOICEACTRESS100_029 93 +jvs010_VOICEACTRESS100_030 61 +jvs010_VOICEACTRESS100_031 43 +jvs010_VOICEACTRESS100_032 105 +jvs010_VOICEACTRESS100_033 38 +jvs010_VOICEACTRESS100_034 42 +jvs010_VOICEACTRESS100_035 78 +jvs010_VOICEACTRESS100_036 77 +jvs010_VOICEACTRESS100_037 55 +jvs010_VOICEACTRESS100_038 74 +jvs010_VOICEACTRESS100_039 63 +jvs010_VOICEACTRESS100_040 42 +jvs010_VOICEACTRESS100_041 69 +jvs010_VOICEACTRESS100_042 43 +jvs010_VOICEACTRESS100_043 75 +jvs010_VOICEACTRESS100_044 91 +jvs010_VOICEACTRESS100_045 84 +jvs010_VOICEACTRESS100_046 52 +jvs010_VOICEACTRESS100_047 81 +jvs010_VOICEACTRESS100_048 87 +jvs010_VOICEACTRESS100_049 64 +jvs010_VOICEACTRESS100_050 88 +jvs010_VOICEACTRESS100_051 53 +jvs010_VOICEACTRESS100_052 85 +jvs010_VOICEACTRESS100_053 62 +jvs010_VOICEACTRESS100_054 125 +jvs010_VOICEACTRESS100_055 40 +jvs010_VOICEACTRESS100_056 80 +jvs010_VOICEACTRESS100_057 60 +jvs010_VOICEACTRESS100_058 66 +jvs010_VOICEACTRESS100_059 71 +jvs010_VOICEACTRESS100_060 69 +jvs010_VOICEACTRESS100_061 83 +jvs010_VOICEACTRESS100_062 76 +jvs010_VOICEACTRESS100_063 47 +jvs010_VOICEACTRESS100_064 60 +jvs010_VOICEACTRESS100_065 101 +jvs010_VOICEACTRESS100_066 77 +jvs010_VOICEACTRESS100_067 74 +jvs010_VOICEACTRESS100_068 63 +jvs010_VOICEACTRESS100_069 103 +jvs010_VOICEACTRESS100_070 81 +jvs010_VOICEACTRESS100_071 68 +jvs010_VOICEACTRESS100_072 73 +jvs010_VOICEACTRESS100_073 67 +jvs010_VOICEACTRESS100_074 54 +jvs010_VOICEACTRESS100_075 70 +jvs010_VOICEACTRESS100_076 89 +jvs010_VOICEACTRESS100_077 91 +jvs010_VOICEACTRESS100_078 101 +jvs010_VOICEACTRESS100_079 88 +jvs010_VOICEACTRESS100_080 77 +jvs010_VOICEACTRESS100_081 95 +jvs010_VOICEACTRESS100_082 75 +jvs010_VOICEACTRESS100_083 61 +jvs010_VOICEACTRESS100_084 86 +jvs010_VOICEACTRESS100_085 111 +jvs010_VOICEACTRESS100_086 70 +jvs010_VOICEACTRESS100_087 101 +jvs010_VOICEACTRESS100_088 65 +jvs010_VOICEACTRESS100_089 73 +jvs010_VOICEACTRESS100_090 62 +jvs010_VOICEACTRESS100_091 38 +jvs010_VOICEACTRESS100_092 79 +jvs010_VOICEACTRESS100_093 106 +jvs010_VOICEACTRESS100_094 149 +jvs010_VOICEACTRESS100_095 87 +jvs010_VOICEACTRESS100_096 94 +jvs010_VOICEACTRESS100_097 102 +jvs010_VOICEACTRESS100_098 68 +jvs010_VOICEACTRESS100_099 51 +jvs010_VOICEACTRESS100_100 89 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn new file mode 100644 index 0000000000000000000000000000000000000000..68e6ab2a0ca0c5bf15f730122b649e46d4c913b5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn @@ -0,0 +1,100 @@ +jvs010_VOICEACTRESS100_001 78,41 +jvs010_VOICEACTRESS100_002 91,41 +jvs010_VOICEACTRESS100_003 69,41 +jvs010_VOICEACTRESS100_004 64,41 +jvs010_VOICEACTRESS100_005 121,41 +jvs010_VOICEACTRESS100_006 49,41 +jvs010_VOICEACTRESS100_007 93,41 +jvs010_VOICEACTRESS100_008 82,41 +jvs010_VOICEACTRESS100_009 69,41 +jvs010_VOICEACTRESS100_010 49,41 +jvs010_VOICEACTRESS100_011 77,41 +jvs010_VOICEACTRESS100_012 55,41 +jvs010_VOICEACTRESS100_013 54,41 +jvs010_VOICEACTRESS100_014 33,41 +jvs010_VOICEACTRESS100_015 47,41 +jvs010_VOICEACTRESS100_016 62,41 +jvs010_VOICEACTRESS100_017 60,41 +jvs010_VOICEACTRESS100_018 78,41 +jvs010_VOICEACTRESS100_019 87,41 +jvs010_VOICEACTRESS100_020 80,41 +jvs010_VOICEACTRESS100_021 67,41 +jvs010_VOICEACTRESS100_022 124,41 +jvs010_VOICEACTRESS100_023 69,41 +jvs010_VOICEACTRESS100_024 86,41 +jvs010_VOICEACTRESS100_025 54,41 +jvs010_VOICEACTRESS100_026 40,41 +jvs010_VOICEACTRESS100_027 69,41 +jvs010_VOICEACTRESS100_028 101,41 +jvs010_VOICEACTRESS100_029 93,41 +jvs010_VOICEACTRESS100_030 61,41 +jvs010_VOICEACTRESS100_031 43,41 +jvs010_VOICEACTRESS100_032 105,41 +jvs010_VOICEACTRESS100_033 38,41 +jvs010_VOICEACTRESS100_034 42,41 +jvs010_VOICEACTRESS100_035 78,41 +jvs010_VOICEACTRESS100_036 77,41 +jvs010_VOICEACTRESS100_037 55,41 +jvs010_VOICEACTRESS100_038 74,41 +jvs010_VOICEACTRESS100_039 63,41 +jvs010_VOICEACTRESS100_040 42,41 +jvs010_VOICEACTRESS100_041 69,41 +jvs010_VOICEACTRESS100_042 43,41 +jvs010_VOICEACTRESS100_043 75,41 +jvs010_VOICEACTRESS100_044 91,41 +jvs010_VOICEACTRESS100_045 84,41 +jvs010_VOICEACTRESS100_046 52,41 +jvs010_VOICEACTRESS100_047 81,41 +jvs010_VOICEACTRESS100_048 87,41 +jvs010_VOICEACTRESS100_049 64,41 +jvs010_VOICEACTRESS100_050 88,41 +jvs010_VOICEACTRESS100_051 53,41 +jvs010_VOICEACTRESS100_052 85,41 +jvs010_VOICEACTRESS100_053 62,41 +jvs010_VOICEACTRESS100_054 125,41 +jvs010_VOICEACTRESS100_055 40,41 +jvs010_VOICEACTRESS100_056 80,41 +jvs010_VOICEACTRESS100_057 60,41 +jvs010_VOICEACTRESS100_058 66,41 +jvs010_VOICEACTRESS100_059 71,41 +jvs010_VOICEACTRESS100_060 69,41 +jvs010_VOICEACTRESS100_061 83,41 +jvs010_VOICEACTRESS100_062 76,41 +jvs010_VOICEACTRESS100_063 47,41 +jvs010_VOICEACTRESS100_064 60,41 +jvs010_VOICEACTRESS100_065 101,41 +jvs010_VOICEACTRESS100_066 77,41 +jvs010_VOICEACTRESS100_067 74,41 +jvs010_VOICEACTRESS100_068 63,41 +jvs010_VOICEACTRESS100_069 103,41 +jvs010_VOICEACTRESS100_070 81,41 +jvs010_VOICEACTRESS100_071 68,41 +jvs010_VOICEACTRESS100_072 73,41 +jvs010_VOICEACTRESS100_073 67,41 +jvs010_VOICEACTRESS100_074 54,41 +jvs010_VOICEACTRESS100_075 70,41 +jvs010_VOICEACTRESS100_076 89,41 +jvs010_VOICEACTRESS100_077 91,41 +jvs010_VOICEACTRESS100_078 101,41 +jvs010_VOICEACTRESS100_079 88,41 +jvs010_VOICEACTRESS100_080 77,41 +jvs010_VOICEACTRESS100_081 95,41 +jvs010_VOICEACTRESS100_082 75,41 +jvs010_VOICEACTRESS100_083 61,41 +jvs010_VOICEACTRESS100_084 86,41 +jvs010_VOICEACTRESS100_085 111,41 +jvs010_VOICEACTRESS100_086 70,41 +jvs010_VOICEACTRESS100_087 101,41 +jvs010_VOICEACTRESS100_088 65,41 +jvs010_VOICEACTRESS100_089 73,41 +jvs010_VOICEACTRESS100_090 62,41 +jvs010_VOICEACTRESS100_091 38,41 +jvs010_VOICEACTRESS100_092 79,41 +jvs010_VOICEACTRESS100_093 106,41 +jvs010_VOICEACTRESS100_094 149,41 +jvs010_VOICEACTRESS100_095 87,41 +jvs010_VOICEACTRESS100_096 94,41 +jvs010_VOICEACTRESS100_097 102,41 +jvs010_VOICEACTRESS100_098 68,41 +jvs010_VOICEACTRESS100_099 51,41 +jvs010_VOICEACTRESS100_100 89,41 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ae6f1e13e8dec058b98660a9825d5610db059609 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e530a389af7d4fda62c559befe36d04b95e949badbfa50ae5ec1690f514c94b2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..9f248a5ab2f0d81afc716e86866bca6897018fd5 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/energy_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8dbff525a5daa34c3059395fdcbf17a1adaaf80f164a4309e63a04029b2a2f1 +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ae6f1e13e8dec058b98660a9825d5610db059609 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e530a389af7d4fda62c559befe36d04b95e949badbfa50ae5ec1690f514c94b2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e4db560ae04e32e7b2b2a0a3b89752d9c12aae5a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a2218f1641993e100999476bd56a0c812bbacd9b5a1e1ce8a3236330bf91a61 +size 1402 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_lengths_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ae6f1e13e8dec058b98660a9825d5610db059609 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e530a389af7d4fda62c559befe36d04b95e949badbfa50ae5ec1690f514c94b2 +size 778 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_stats.npz b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f8786a89e8a5eb299d04fbf5604e9051c484e50a --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/pitch_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a45adf6339d522b67fe12c47238cad9a347aa864a34eaa9e390861f753be6ed +size 770 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..0a30f55f5b67285715e1c8f1fa763a24d3f10977 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape @@ -0,0 +1,15 @@ +jvs010_BASIC5000_0113 143040 +jvs010_BASIC5000_0261 77520 +jvs010_BASIC5000_0351 72720 +jvs010_BASIC5000_0882 91440 +jvs010_BASIC5000_1009 73920 +jvs010_BASIC5000_1087 182160 +jvs010_BASIC5000_1122 108000 +jvs010_BASIC5000_1274 67680 +jvs010_BASIC5000_1328 94319 +jvs010_BASIC5000_1355 102720 +jvs010_BASIC5000_1474 87119 +jvs010_BASIC5000_1559 177120 +jvs010_BASIC5000_1645 131280 +jvs010_BASIC5000_1849 205440 +jvs010_BASIC5000_1889 61680 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..b2769e8d36bbcff6130f58cdf379bcbfc0ac1ea8 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape @@ -0,0 +1,15 @@ +jvs010_BASIC5000_0113 56 +jvs010_BASIC5000_0261 39 +jvs010_BASIC5000_0351 36 +jvs010_BASIC5000_0882 49 +jvs010_BASIC5000_1009 38 +jvs010_BASIC5000_1087 96 +jvs010_BASIC5000_1122 54 +jvs010_BASIC5000_1274 35 +jvs010_BASIC5000_1328 49 +jvs010_BASIC5000_1355 45 +jvs010_BASIC5000_1474 47 +jvs010_BASIC5000_1559 82 +jvs010_BASIC5000_1645 64 +jvs010_BASIC5000_1849 81 +jvs010_BASIC5000_1889 39 diff --git a/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn new file mode 100644 index 0000000000000000000000000000000000000000..71f0b9566761e7a1ad3ea075251f5cdd19bc6c84 --- /dev/null +++ b/exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn @@ -0,0 +1,15 @@ +jvs010_BASIC5000_0113 56,41 +jvs010_BASIC5000_0261 39,41 +jvs010_BASIC5000_0351 36,41 +jvs010_BASIC5000_0882 49,41 +jvs010_BASIC5000_1009 38,41 +jvs010_BASIC5000_1087 96,41 +jvs010_BASIC5000_1122 54,41 +jvs010_BASIC5000_1274 35,41 +jvs010_BASIC5000_1328 49,41 +jvs010_BASIC5000_1355 45,41 +jvs010_BASIC5000_1474 47,41 +jvs010_BASIC5000_1559 82,41 +jvs010_BASIC5000_1645 64,41 +jvs010_BASIC5000_1849 81,41 +jvs010_BASIC5000_1889 39,41 diff --git a/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60efe42f387badcea6b28e88efd8f231f95b3f63 --- /dev/null +++ b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml @@ -0,0 +1,393 @@ +config: conf/tuning/train_jets.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk +ngpu: 1 +seed: 777 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: 4 +dist_rank: 0 +local_rank: 0 +dist_master_addr: localhost +dist_master_port: 52975 +dist_launcher: null +multiprocessing_distributed: true +unused_parameters: true +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: false +collect_stats: false +write_collected_feats: false +max_epoch: 130 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - text2mel_loss + - min +- - train + - text2mel_loss + - min +- - train + - total_count + - max +keep_nbest_models: -1 +nbest_averaging_interval: 0 +grad_clip: -1 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: false +log_interval: 50 +use_matplotlib: true +use_tensorboard: true +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 1000 +batch_size: 20 +valid_batch_size: null +batch_bins: 6000000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape +valid_shape_file: +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn +- exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape +batch_type: numel +valid_batch_type: null +fold_length: +- 150 +- 240000 +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +train_data_path_and_name_and_type: +- - dump/raw/jvs010_tr_no_dev/text + - text + - text +- - dump/raw/jvs010_tr_no_dev/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/jvs010_dev/text + - text + - text +- - dump/raw/jvs010_dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +optim: adamw +optim_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler: exponentiallr +scheduler_conf: + gamma: 0.999875 +optim2: adamw +optim2_conf: + lr: 0.0002 + betas: + - 0.8 + - 0.99 + eps: 1.0e-09 + weight_decay: 0.0 +scheduler2: exponentiallr +scheduler2_conf: + gamma: 0.999875 +generator_first: true +token_list: +- +- +- o +- a +- u +- i +- e +- k +- r +- t +- n +- pau +- N +- s +- sh +- d +- m +- g +- w +- b +- cl +- I +- j +- ch +- y +- U +- h +- p +- ts +- f +- z +- ky +- ny +- gy +- ry +- hy +- my +- by +- py +- v +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: jaconv +g2p: pyopenjtalk +feats_extract: fbank +feats_extract_conf: + n_fft: 2048 + hop_length: 300 + win_length: 1200 + fs: 24000 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: global_mvn +normalize_conf: + stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz +tts: jets +tts_conf: + generator_type: jets_generator + generator_params: + adim: 256 + aheads: 2 + elayers: 4 + eunits: 1024 + dlayers: 4 + dunits: 1024 + positionwise_layer_type: conv1d + positionwise_conv_kernel_size: 3 + duration_predictor_layers: 2 + duration_predictor_chans: 256 + duration_predictor_kernel_size: 3 + use_masking: true + encoder_normalize_before: true + decoder_normalize_before: true + encoder_type: transformer + decoder_type: transformer + conformer_rel_pos_type: latest + conformer_pos_enc_layer_type: rel_pos + conformer_self_attn_layer_type: rel_selfattn + conformer_activation_type: swish + use_macaron_style_in_conformer: true + use_cnn_in_conformer: true + conformer_enc_kernel_size: 7 + conformer_dec_kernel_size: 31 + init_type: xavier_uniform + transformer_enc_dropout_rate: 0.2 + transformer_enc_positional_dropout_rate: 0.2 + transformer_enc_attn_dropout_rate: 0.2 + transformer_dec_dropout_rate: 0.2 + transformer_dec_positional_dropout_rate: 0.2 + transformer_dec_attn_dropout_rate: 0.2 + pitch_predictor_layers: 5 + pitch_predictor_chans: 256 + pitch_predictor_kernel_size: 5 + pitch_predictor_dropout: 0.5 + pitch_embed_kernel_size: 1 + pitch_embed_dropout: 0.0 + stop_gradient_from_pitch_predictor: true + energy_predictor_layers: 2 + energy_predictor_chans: 256 + energy_predictor_kernel_size: 3 + energy_predictor_dropout: 0.5 + energy_embed_kernel_size: 1 + energy_embed_dropout: 0.0 + stop_gradient_from_energy_predictor: false + generator_out_channels: 1 + generator_channels: 512 + generator_global_channels: -1 + generator_kernel_size: 7 + generator_upsample_scales: + - 8 + - 8 + - 2 + - 2 + generator_upsample_kernel_sizes: + - 16 + - 16 + - 4 + - 4 + generator_resblock_kernel_sizes: + - 3 + - 7 + - 11 + generator_resblock_dilations: + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + - - 1 + - 3 + - 5 + generator_use_additional_convs: true + generator_bias: true + generator_nonlinear_activation: LeakyReLU + generator_nonlinear_activation_params: + negative_slope: 0.1 + generator_use_weight_norm: true + segment_size: 64 + idim: 41 + odim: 80 + discriminator_type: hifigan_multi_scale_multi_period_discriminator + discriminator_params: + scales: 1 + scale_downsample_pooling: AvgPool1d + scale_downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + scale_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 15 + - 41 + - 5 + - 3 + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: + - 2 + - 2 + - 4 + - 4 + - 1 + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + follow_official_norm: false + periods: + - 2 + - 3 + - 5 + - 7 + - 11 + period_discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: + - 5 + - 3 + channels: 32 + downsample_scales: + - 3 + - 3 + - 3 + - 3 + - 1 + max_downsample_channels: 1024 + bias: true + nonlinear_activation: LeakyReLU + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + use_spectral_norm: false + generator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + discriminator_adv_loss_params: + average_by_discriminators: false + loss_type: mse + feat_match_loss_params: + average_by_discriminators: false + average_by_layers: false + include_final_outputs: true + mel_loss_params: + fs: 24000 + n_fft: 1024 + hop_length: 256 + win_length: null + window: hann + n_mels: 80 + fmin: 0 + fmax: null + log_base: null + lambda_adv: 1.0 + lambda_mel: 45.0 + lambda_feat_match: 2.0 + lambda_var: 1.0 + lambda_align: 2.0 + sampling_rate: 24000 + cache_generator_outputs: true +pitch_extract: dio +pitch_extract_conf: + reduction_factor: 1 + use_token_averaged_f0: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + f0max: 400 + f0min: 80 +pitch_normalize: global_mvn +pitch_normalize_conf: + stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz +energy_extract: energy +energy_extract_conf: + reduction_factor: 1 + use_token_averaged_energy: false + fs: 24000 + n_fft: 2048 + hop_length: 300 + win_length: 1200 +energy_normalize: global_mvn +energy_normalize_conf: + stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz +required: +- output_dir +- token_list +version: '202204' +distributed: true diff --git a/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/run.sh b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..cca74106b3af636fb056ebcd05059f456a275fd8 --- /dev/null +++ b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/run.sh @@ -0,0 +1 @@ +./tts.sh --lang jp --local_data_opts '--spk jvs010' --feats_type raw --fs 24000 --n_fft 2048 --n_shift 300 --win_length 1200 --token_type phn --cleaner jaconv --g2p pyopenjtalk --train_config conf/finetune.yaml --inference_config conf/decode.yaml --train_set jvs010_tr_no_dev --valid_set jvs010_dev --test_sets 'jvs010_dev jvs010_eval1' --srctexts data/jvs010_tr_no_dev/text --audio_format wav --train_config conf/tuning/train_jets.yaml --tts_task gan_tts --stage 1 --stop_stage 7 --ngpu 4 --stage 6 "$@"; exit $? diff --git a/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091035.92b100c97f43.1159464.0 b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091035.92b100c97f43.1159464.0 new file mode 100644 index 0000000000000000000000000000000000000000..b449d1189075d07ad1ad33de61b432ab6ad6db16 --- /dev/null +++ b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/train/events.out.tfevents.1741091035.92b100c97f43.1159464.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8704d80d583693bc56ad0906c5fa8f30dc6156977506f396db7d40a7c398bd +size 4873 diff --git a/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091035.92b100c97f43.1159464.1 b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091035.92b100c97f43.1159464.1 new file mode 100644 index 0000000000000000000000000000000000000000..07fc31d1fdc7fde1fa1bab570f3ba8b3d3d091e5 --- /dev/null +++ b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/tensorboard/valid/events.out.tfevents.1741091035.92b100c97f43.1159464.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba42604a3bafab883c8298950164c1807a12df8e055c1a9f01fa041210e647a8 +size 88 diff --git a/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.log b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.log new file mode 100644 index 0000000000000000000000000000000000000000..b23b5429edde80e1a8b052efa3d0126edd4c970a --- /dev/null +++ b/exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/train.log @@ -0,0 +1,985 @@ +# python3 -m espnet2.bin.gan_tts_train --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True +# Started at Tue Mar 4 21:23:38 JST 2025 +# +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/bin/python3 /work/espnet/espnet2/bin/gan_tts_train.py --use_preprocessor true --token_type phn --token_list dump/token_list/phn_jaconv_pyopenjtalk/tokens.txt --non_linguistic_symbols none --cleaner jaconv --g2p pyopenjtalk --normalize global_mvn --resume true --fold_length 150 --fold_length 240000 --output_dir exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk --config conf/tuning/train_jets.yaml --feats_extract fbank --feats_extract_conf n_fft=2048 --feats_extract_conf hop_length=300 --feats_extract_conf win_length=1200 --feats_extract_conf fs=24000 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/text,text,text --train_data_path_and_name_and_type dump/raw/jvs010_tr_no_dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/text_shape.phn --train_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/speech_shape --valid_data_path_and_name_and_type dump/raw/jvs010_dev/text,text,text --valid_data_path_and_name_and_type dump/raw/jvs010_dev/wav.scp,speech,sound --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn --valid_shape_file exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/speech_shape --pitch_extract_conf fs=24000 --pitch_extract_conf n_fft=2048 --pitch_extract_conf hop_length=300 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --pitch_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz --energy_extract_conf fs=24000 --energy_extract_conf n_fft=2048 --energy_extract_conf hop_length=300 --energy_extract_conf win_length=1200 --energy_normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz --normalize_conf stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz --ngpu 4 --multiprocessing_distributed True +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[92b100c97f43:0/4] 2025-03-04 21:23:44,840 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[92b100c97f43:0/4] 2025-03-04 21:23:44,896 (gan_tts:304) INFO: Vocabulary size: 41 +[92b100c97f43:0/4] 2025-03-04 21:23:45,027 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43:0/4] 2025-03-04 21:23:45,249 (encoder:172) INFO: encoder self-attention layer type = self-attention +[92b100c97f43:0/4] 2025-03-04 21:23:54,351 (abs_task:1157) INFO: pytorch.version=1.10.1+cu113, cuda.available=True, cudnn.version=8200, cudnn.benchmark=False, cudnn.deterministic=False +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1158) INFO: Model structure: +ESPnetGANTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/feats_stats.npz, norm_means=True, norm_vars=True) + (pitch_extract): Dio() + (pitch_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/pitch_stats.npz, norm_means=True, norm_vars=True) + (energy_extract): Energy( + (stft): Stft(n_fft=2048, win_length=1200, hop_length=300, center=True, normalized=False, onesided=True) + ) + (energy_normalize): GlobalMVN(stats_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/train/energy_stats.npz, norm_means=True, norm_vars=True) + (tts): JETS( + (generator): JETSGenerator( + (encoder): Encoder( + (embed): Sequential( + (0): Embedding(41, 256, padding_idx=0) + (1): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (duration_predictor): DurationPredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.1, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (2): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (3): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(256, 256, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (pitch_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (energy_predictor): VariancePredictor( + (conv): ModuleList( + (0): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + (1): Sequential( + (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (1): ReLU() + (2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (3): Dropout(p=0.5, inplace=False) + ) + ) + (linear): Linear(in_features=256, out_features=1, bias=True) + ) + (energy_embed): Sequential( + (0): Conv1d(1, 256, kernel_size=(1,), stride=(1,)) + (1): Dropout(p=0.0, inplace=False) + ) + (alignment_module): AlignmentModule( + (t_conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (t_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (f_conv1): Conv1d(80, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (f_conv3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + ) + (length_regulator): GaussianUpsampling() + (decoder): Encoder( + (embed): Sequential( + (0): ScaledPositionalEncoding( + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + (feed_forward): MultiLayeredConv1d( + (w_1): Conv1d(256, 1024, kernel_size=(3,), stride=(1,), padding=(1,)) + (w_2): Conv1d(1024, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + (dropout): Dropout(p=0.2, inplace=False) + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.2, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (generator): HiFiGANGenerator( + (input_conv): Conv1d(256, 512, kernel_size=(7,), stride=(1,), padding=(3,)) + (upsamples): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(512, 256, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(256, 128, kernel_size=(16,), stride=(8,), padding=(4,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(128, 64, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + (3): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): ConvTranspose1d(64, 32, kernel_size=(4,), stride=(2,), padding=(1,)) + ) + ) + (blocks): ModuleList( + (0): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (1): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (2): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(256, 256, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (3): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (4): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (5): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (6): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (7): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (8): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(64, 64, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + (9): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(5,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + (10): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(15,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(7,), stride=(1,), padding=(3,)) + ) + ) + ) + (11): ResidualBlock( + (convs1): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(15,), dilation=(3,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(25,), dilation=(5,)) + ) + ) + (convs2): ModuleList( + (0): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (1): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + (2): Sequential( + (0): LeakyReLU(negative_slope=0.1) + (1): Conv1d(32, 32, kernel_size=(11,), stride=(1,), padding=(5,)) + ) + ) + ) + ) + (output_conv): Sequential( + (0): LeakyReLU(negative_slope=0.01) + (1): Conv1d(32, 1, kernel_size=(7,), stride=(1,), padding=(3,)) + (2): Tanh() + ) + ) + ) + (discriminator): HiFiGANMultiScaleMultiPeriodDiscriminator( + (msd): HiFiGANMultiScaleDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANScaleDiscriminator( + (layers): ModuleList( + (0): Sequential( + (0): Conv1d(1, 128, kernel_size=(15,), stride=(1,), padding=(7,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv1d(128, 128, kernel_size=(41,), stride=(2,), padding=(20,), groups=4) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv1d(128, 256, kernel_size=(41,), stride=(2,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv1d(256, 512, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv1d(512, 1024, kernel_size=(41,), stride=(4,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (5): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(41,), stride=(1,), padding=(20,), groups=16) + (1): LeakyReLU(negative_slope=0.1) + ) + (6): Sequential( + (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(2,)) + (1): LeakyReLU(negative_slope=0.1) + ) + (7): Conv1d(1024, 1, kernel_size=(3,), stride=(1,), padding=(1,)) + ) + ) + ) + ) + (mpd): HiFiGANMultiPeriodDiscriminator( + (discriminators): ModuleList( + (0): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (1): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (2): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (3): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + (4): HiFiGANPeriodDiscriminator( + (convs): ModuleList( + (0): Sequential( + (0): Conv2d(1, 32, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (1): Sequential( + (0): Conv2d(32, 128, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (2): Sequential( + (0): Conv2d(128, 512, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (3): Sequential( + (0): Conv2d(512, 1024, kernel_size=(5, 1), stride=(3, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + (4): Sequential( + (0): Conv2d(1024, 1024, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0)) + (1): LeakyReLU(negative_slope=0.1) + ) + ) + (output_conv): Conv2d(1024, 1, kernel_size=(2, 1), stride=(1, 1), padding=(1, 0)) + ) + ) + ) + ) + (generator_adv_loss): GeneratorAdversarialLoss() + (discriminator_adv_loss): DiscriminatorAdversarialLoss() + (feat_match_loss): FeatureMatchLoss() + (mel_loss): MelSpectrogramLoss( + (wav_to_mel): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=24000, n_fft=1024, n_mels=80, fmin=0, fmax=12000.0, htk=False) + ) + ) + (var_loss): VarianceLoss( + (mse_criterion): MSELoss() + (duration_criterion): DurationPredictorLoss( + (criterion): MSELoss() + ) + ) + (forwardsum_loss): ForwardSumLoss() + ) +) + +Model summary: + Class Name: ESPnetGANTTSModel + Total Number of model parameters: 83.28 M + Number of trainable parameters: 83.28 M (100.0%) + Size: 333.11 MB + Type: torch.float32 +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler: +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1161) INFO: Optimizer2: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.8, 0.99] + eps: 1e-09 + initial_lr: 0.0002 + lr: 0.0002 + weight_decay: 0.0 +) +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1162) INFO: Scheduler2: +[92b100c97f43:0/4] 2025-03-04 21:23:54,361 (abs_task:1171) INFO: Saving the configuration in exp/tts_train_jets_raw_phn_jaconv_pyopenjtalk/config.yaml +[92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1525) INFO: [train] dataset: +ESPnetDataset( + text: {"path": "dump/raw/jvs010_tr_no_dev/text", "type": "text"} + speech: {"path": "dump/raw/jvs010_tr_no_dev/wav.scp", "type": "sound"} + preprocess: ) +[92b100c97f43:0/4] 2025-03-04 21:23:54,698 (abs_task:1526) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=4, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending) +[92b100c97f43:0/4] 2025-03-04 21:23:54,699 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=4, mean=25.0, min=5, max=41 +[92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1525) INFO: [valid] dataset: +ESPnetDataset( + text: {"path": "dump/raw/jvs010_dev/text", "type": "text"} + speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"} + preprocess: ) +[92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1526) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=1, batch_bins=6000000, sort_in_batch=descending, sort_batch=descending) +[92b100c97f43:0/4] 2025-03-04 21:23:54,719 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=1, mean=15.0, min=15, max=15 +[92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1525) INFO: [plot_att] dataset: +ESPnetDataset( + text: {"path": "dump/raw/jvs010_dev/text", "type": "text"} + speech: {"path": "dump/raw/jvs010_dev/wav.scp", "type": "sound"} + preprocess: ) +[92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15, batch_size=1, key_file=exp/tts_stats_raw_phn_jaconv_pyopenjtalk/valid/text_shape.phn, +[92b100c97f43:0/4] 2025-03-04 21:23:54,739 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +92b100c97f43:1159464:1159464 [0] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0> +92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +92b100c97f43:1159464:1159464 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] +92b100c97f43:1159464:1159464 [0] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0> +92b100c97f43:1159464:1159464 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.3 +92b100c97f43:1159466:1159466 [2] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0> +92b100c97f43:1159465:1159465 [1] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0> +92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +92b100c97f43:1159466:1159466 [2] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] + +92b100c97f43:1159465:1159465 [1] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] +92b100c97f43:1159465:1159465 [1] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0> +92b100c97f43:1159466:1159466 [2] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0> +92b100c97f43:1159465:1159465 [1] NCCL INFO Using network Socket +92b100c97f43:1159466:1159466 [2] NCCL INFO Using network Socket +92b100c97f43:1159467:1159467 [3] NCCL INFO Bootstrap : Using eth0:172.17.0.2<0> +92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +92b100c97f43:1159467:1159467 [3] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1] +92b100c97f43:1159467:1159467 [3] NCCL INFO NET/Socket : Using [0]eth0:172.17.0.2<0> +92b100c97f43:1159467:1159467 [3] NCCL INFO Using network Socket +92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +92b100c97f43:1159465:1159504 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +92b100c97f43:1159467:1159505 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +92b100c97f43:1159466:1159503 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +92b100c97f43:1159465:1159504 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff +92b100c97f43:1159464:1159502 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +92b100c97f43:1159467:1159505 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff +92b100c97f43:1159466:1159503 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff +92b100c97f43:1159464:1159502 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff +92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 3[60] via direct shared memory +92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 0[30] via direct shared memory +92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 00 : 0[30] -> 1[40] via direct shared memory +92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 2[50] via direct shared memory +92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 3[60] via direct shared memory +92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 0[30] via direct shared memory +92b100c97f43:1159464:1159502 [0] NCCL INFO Channel 01 : 0[30] -> 1[40] via direct shared memory +92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 2[50] via direct shared memory +92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all rings +92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all rings +92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all rings +92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all rings +92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 00 : 3[60] -> 2[50] via direct shared memory +92b100c97f43:1159467:1159505 [3] NCCL INFO Channel 01 : 3[60] -> 2[50] via direct shared memory +92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 00 : 2[50] -> 1[40] via direct shared memory +92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 00 : 1[40] -> 0[30] via direct shared memory +92b100c97f43:1159466:1159503 [2] NCCL INFO Channel 01 : 2[50] -> 1[40] via direct shared memory +92b100c97f43:1159465:1159504 [1] NCCL INFO Channel 01 : 1[40] -> 0[30] via direct shared memory +92b100c97f43:1159464:1159502 [0] NCCL INFO Connected all trees +92b100c97f43:1159464:1159502 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +92b100c97f43:1159464:1159502 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +92b100c97f43:1159467:1159505 [3] NCCL INFO Connected all trees +92b100c97f43:1159467:1159505 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +92b100c97f43:1159467:1159505 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +92b100c97f43:1159465:1159504 [1] NCCL INFO Connected all trees +92b100c97f43:1159465:1159504 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +92b100c97f43:1159465:1159504 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +92b100c97f43:1159466:1159503 [2] NCCL INFO Connected all trees +92b100c97f43:1159466:1159503 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +92b100c97f43:1159466:1159503 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +92b100c97f43:1159466:1159503 [2] NCCL INFO comm 0x7f35f80030d0 rank 2 nranks 4 cudaDev 2 busId 50 - Init COMPLETE +92b100c97f43:1159464:1159502 [0] NCCL INFO comm 0x7f55500030d0 rank 0 nranks 4 cudaDev 0 busId 30 - Init COMPLETE +92b100c97f43:1159464:1159464 [0] NCCL INFO Launch mode Parallel +92b100c97f43:1159465:1159504 [1] NCCL INFO comm 0x7f97600030d0 rank 1 nranks 4 cudaDev 1 busId 40 - Init COMPLETE +92b100c97f43:1159467:1159505 [3] NCCL INFO comm 0x7f66b80030d0 rank 3 nranks 4 cudaDev 3 busId 60 - Init COMPLETE +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[92b100c97f43:0/4] 2025-03-04 21:23:55,188 (trainer:280) INFO: 1/130epoch started +/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (2.2.3) or chardet (3.0.4) doesn't match a supported version! + warnings.warn("urllib3 ({}) or chardet ({}) doesn't match a supported " +[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +[W reducer.cpp:1303] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +/work/espnet/espnet2/layers/stft.py:166: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). + olens = (ilens - self.n_fft) // self.hop_length + 1 +[92b100c97f43:0/4] 2025-03-04 21:25:36,523 (gan_trainer:305) INFO: 1epoch:train:1-50batch: iter_time=0.048, generator_forward_time=0.723, generator_loss=139.743, generator_g_loss=110.582, generator_var_loss=5.224, generator_align_loss=23.937, generator_g_mel_loss=106.758, generator_g_adv_loss=2.179, generator_g_feat_match_loss=1.645, generator_var_dur_loss=0.584, generator_var_pitch_loss=2.400, generator_var_energy_loss=2.240, generator_align_forwardsum_loss=10.599, generator_align_bin_loss=1.369, generator_backward_time=0.254, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.113, discriminator_forward_time=0.544, discriminator_loss=2.766, discriminator_real_loss=1.518, discriminator_fake_loss=1.247, discriminator_backward_time=0.198, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.788, train_time=2.024 +[92b100c97f43:0/4] 2025-03-04 21:27:08,245 (gan_trainer:305) INFO: 1epoch:train:51-100batch: iter_time=1.198e-04, generator_forward_time=0.634, generator_loss=111.648, generator_g_loss=85.935, generator_var_loss=2.179, generator_align_loss=23.534, generator_g_mel_loss=80.251, generator_g_adv_loss=2.332, generator_g_feat_match_loss=3.352, generator_var_dur_loss=0.089, generator_var_pitch_loss=0.924, generator_var_energy_loss=1.166, generator_align_forwardsum_loss=10.437, generator_align_bin_loss=1.330, generator_backward_time=0.258, generator_optim_step_time=0.034, optim0_lr0=2.000e-04, generator_train_time=1.027, discriminator_forward_time=0.548, discriminator_loss=2.396, discriminator_real_loss=1.381, discriminator_fake_loss=1.015, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.796, train_time=1.835 +[92b100c97f43:0/4] 2025-03-04 21:28:38,897 (gan_trainer:305) INFO: 1epoch:train:101-150batch: iter_time=1.203e-04, generator_forward_time=0.624, generator_loss=112.406, generator_g_loss=87.597, generator_var_loss=1.890, generator_align_loss=22.919, generator_g_mel_loss=80.508, generator_g_adv_loss=2.744, generator_g_feat_match_loss=4.346, generator_var_dur_loss=0.058, generator_var_pitch_loss=0.808, generator_var_energy_loss=1.024, generator_align_forwardsum_loss=10.071, generator_align_bin_loss=1.389, generator_backward_time=0.257, generator_optim_step_time=0.033, optim0_lr0=2.000e-04, generator_train_time=1.015, discriminator_forward_time=0.539, discriminator_loss=2.084, discriminator_real_loss=1.319, discriminator_fake_loss=0.765, discriminator_backward_time=0.201, discriminator_optim_step_time=0.009, optim1_lr0=2.000e-04, discriminator_train_time=0.787, train_time=1.813