diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.1.log b/exp/tts_stats_raw_phn_none/logdir/stats.1.log new file mode 100644 index 0000000000000000000000000000000000000000..e5df87d65d9f42dd64ba3e546981bfcd47599481 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.1.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:11 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:09:21,971 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:09:22,770 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.1/config.yaml +[7850374a3496] 2023-07-13 14:09:22,799 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.1', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.1.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.1.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=16 threads=1 +# Ended (code 0) at Thu Jul 13 14:09:27 UTC 2023, elapsed time 16 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de3940a359091a38d758b6442270b477e15f70b1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.10 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.10.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.10.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..30d842df3c7efed9c353e45d3ab0168b6fe4ddd6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6b9f2fd6232f4b0ca33457b5d22c02d2b17b34d24e2f9f1f2415b0ec8a15f0 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7b35473a2b5338d1e7ad61e6f68190392314c6e2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:057d10f13786abd5b7b6b90bea854b18ad227d34f19bb8092c488f864880dd51 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..bf3077b35d83bc6860354637a2538337c2de44d7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape @@ -0,0 +1,43 @@ +18935 142336 +18936 141568 +18943 175360 +18944 173824 +18947 190208 +18951 154368 +18955 233216 +18959 226560 +18964 163584 +18982 113664 +18989 163072 +18991 212480 +18993 175872 +18997 101888 +19 122880 +19001 217088 +19005 184832 +19010 156928 +19011 175872 +19015 139520 +19024 165888 +19028 158720 +19063 187136 +19065 144128 +19067 175616 +19075 163584 +19076 214784 +19090 172544 +19091 199936 +19095 118016 +19096 165888 +19099 159488 +191 134144 +19103 124416 +19109 132352 +19111 151740 +19113 129280 +19116 155648 +19118 174336 +19121 137472 +19122 144896 +19132 131072 +19138 135936 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..7ded8c9286d618c62cb753d9d619efb57a6f2822 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape @@ -0,0 +1,43 @@ +18935 66 +18936 77 +18943 94 +18944 90 +18947 93 +18951 66 +18955 116 +18959 120 +18964 81 +18982 54 +18989 85 +18991 114 +18993 100 +18997 45 +19 58 +19001 132 +19005 97 +19010 82 +19011 97 +19015 72 +19024 90 +19028 71 +19063 115 +19065 84 +19067 83 +19075 78 +19076 112 +19090 92 +19091 108 +19095 62 +19096 89 +19099 87 +191 70 +19103 68 +19109 75 +19111 80 +19113 45 +19116 87 +19118 97 +19121 74 +19122 87 +19132 69 +19138 75 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..2addf17918372f05208847df78ce5a326e5abebf --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe87f28f6100dafb92cda513225e57bd983e4483dbefd895ad65790398958c0 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7027536e5c61c73ca18e72488df11124bf266ffd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c23bf05ba35b7d316b51347290281e31e36aca870887098c995fd8f5c860508 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3e26db37caadf20a5ce96147582e80c8ddee7a78 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape @@ -0,0 +1,2 @@ +169 189952 +18237 234496 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..2e3c874e113ef77d31a2d1fcc36fa39f2bcb6659 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape @@ -0,0 +1,2 @@ +169 104 +18237 134 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.11.log b/exp/tts_stats_raw_phn_none/logdir/stats.11.log new file mode 100644 index 0000000000000000000000000000000000000000..3a7b29dacc1d42d4501cd21c17037bb0ae4dfcd3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.11.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:19 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:27,026 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:27,731 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.11/config.yaml +[7850374a3496] 2023-07-13 14:10:27,761 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.11', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.11.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.11.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:31 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12.log b/exp/tts_stats_raw_phn_none/logdir/stats.12.log new file mode 100644 index 0000000000000000000000000000000000000000..db82b864f0b435d41db775df20d66427400151d5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:19 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:27,287 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:27,998 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml +[7850374a3496] 2023-07-13 14:10:28,024 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.12', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.12.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.12.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:32 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6bce568ddddfb825567857612c0e3cb174f2559e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.12 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.12.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.12.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..dfe4fbe80077655b65443b6a93d0ad1cd2c426e1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d49bfea4033ce1e51b1a17d023326b9c8fc5b58658ad92a4ab13fae6f7b8d624 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..085fde983606545e7177c2c25beb825f829e2490 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c6bcb5409152fe06dfbb367a0d796774c6f1e94af5fa448137f8a901f9c284 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3b285379f04fb1dc48aad770682243a68253e632 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape @@ -0,0 +1,43 @@ +19360 171776 +19366 143104 +19367 199936 +19371 145920 +19372 162816 +19374 145664 +19376 201682 +19387 219904 +19396 130048 +19399 112896 +194 140032 +19400 183808 +19404 159488 +19406 186624 +19410 183552 +19413 121088 +19414 134912 +19423 198400 +19429 195328 +19439 114944 +19440 97280 +19449 159488 +19451 140032 +19454 120320 +19477 191488 +19482 157696 +19488 169472 +19496 129792 +19499 153344 +195 122624 +19501 137216 +19506 162816 +19509 143872 +19510 119040 +19511 146688 +19521 132864 +19522 167680 +19524 146944 +19529 188928 +19540 193536 +19542 179456 +19543 159669 +19548 138752 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..976159879a5825de98a7253c642e5c67b6c7b8d8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape @@ -0,0 +1,43 @@ +19360 89 +19366 85 +19367 92 +19371 64 +19372 84 +19374 75 +19376 103 +19387 108 +19396 57 +19399 54 +194 62 +19400 92 +19404 99 +19406 103 +19410 102 +19413 67 +19414 60 +19423 90 +19429 90 +19439 51 +19440 33 +19449 92 +19451 67 +19454 61 +19477 97 +19482 84 +19488 93 +19496 63 +19499 65 +195 68 +19501 60 +19506 96 +19509 80 +19510 57 +19511 77 +19521 61 +19522 80 +19524 67 +19529 91 +19540 101 +19542 107 +19543 78 +19548 61 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e9ba6e207a7e35f7028fd8d097eee467a153284e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f504865400934158e457d1520e847de7701d6bd8479c772a7d9710d35616c234 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c9238294bb224d251164e3dddf11751f6c3d49c3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51eb4c28274a7e81976f604f330f3c2f10cc6cf6a8befcf261e3e49cbdd44ab0 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4df547d109d95da1851197e8fb87c087d8465a42 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape @@ -0,0 +1,2 @@ +18963 129280 +19178 177408 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..22f9e2ed98dad9e01f7db1b2b7295ccad861503a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape @@ -0,0 +1,2 @@ +18963 58 +19178 91 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.13.log b/exp/tts_stats_raw_phn_none/logdir/stats.13.log new file mode 100644 index 0000000000000000000000000000000000000000..ef6f0452ebdc73ce41b74685a87c3a99e597ea57 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.13.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:31 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:40,144 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:40,861 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.13/config.yaml +[7850374a3496] 2023-07-13 14:10:40,891 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.13', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.13.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.13.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:44 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.14.log b/exp/tts_stats_raw_phn_none/logdir/stats.14.log new file mode 100644 index 0000000000000000000000000000000000000000..3076226abc482a4802d8d6f5194b378cad543aff --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.14.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:32 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:40,319 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:41,034 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.14/config.yaml +[7850374a3496] 2023-07-13 14:10:41,061 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.14', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.14.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.14.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:45 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15.log b/exp/tts_stats_raw_phn_none/logdir/stats.15.log new file mode 100644 index 0000000000000000000000000000000000000000..afb178074a95edde2c4753dd31a702587461e1e5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:44 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:52,286 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:52,979 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml +[7850374a3496] 2023-07-13 14:10:53,009 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.15', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.15.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.15.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:56 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ffd3e20551b749cbc1ece1d9f4d0b4568489ec1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.15 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.15.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.15.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4482e0d248ed2f285e219d77069a783df1785267 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7321393931081a400396aafc1edb9605c0808638ac13716f0f23942f51e167a +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8bcda75be06bb85fc29c87834c37cc0b9439c311 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:239fc3342b13b16bd839d1ac8b21666aad2e96b1b2341e9c6c4c53e063f99526 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..7516e5602e2c4b5a8b1aeaefc09963309f7e204a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape @@ -0,0 +1,43 @@ +19928 141056 +19931 133376 +19935 203520 +19938 102144 +19944 126464 +19946 116992 +19947 154112 +19948 171637 +19949 141056 +19951 214272 +19952 165376 +19955 134912 +19957 150596 +19959 176896 +19976 169472 +19979 119808 +19981 134144 +19984 171520 +19990 235008 +19998 195840 +200 125440 +20001 184576 +20005 108032 +20020 164608 +20022 235264 +20029 174080 +20038 216576 +20042 241920 +20051 203776 +20055 168448 +20062 152064 +20080 219136 +20087 116992 +20095 193792 +201 119040 +20109 167424 +20119 149760 +20120 154368 +20121 172288 +20128 143872 +20144 112128 +20147 167168 +20183 139520 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..1b0480af516ee03d6d3e845b9cc263c15ce30ad2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape @@ -0,0 +1,43 @@ +19928 71 +19931 84 +19935 117 +19938 58 +19944 71 +19946 52 +19947 71 +19948 80 +19949 64 +19951 133 +19952 102 +19955 64 +19957 81 +19959 106 +19976 97 +19979 66 +19981 66 +19984 80 +19990 127 +19998 100 +200 64 +20001 98 +20005 61 +20020 68 +20022 143 +20029 103 +20038 123 +20042 136 +20051 106 +20055 97 +20062 90 +20080 124 +20087 52 +20095 101 +201 67 +20109 82 +20119 64 +20120 93 +20121 83 +20128 67 +20144 69 +20147 78 +20183 70 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..fc52709fb2133de4dec4e04b188a3ad1635e87f2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab01415ef2a97eaa04e81355080bc38b3f9b0343f8e97e91044090b6ff63685 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ebeabaa534cdcaf8914e54ecf8404caf91dda3e9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89cf49faa040cf392b87903167b64d4599f1d322f1a2937c91823fca48e139a9 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..a32150e06c7896ef7cfec666068e8411dcb9ae4e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape @@ -0,0 +1,2 @@ +19769 152064 +19771 194816 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..060a4c940a21d4526d8a4174f184a5bfcb489101 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape @@ -0,0 +1,2 @@ +19769 84 +19771 108 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17.log b/exp/tts_stats_raw_phn_none/logdir/stats.17.log new file mode 100644 index 0000000000000000000000000000000000000000..2f58dde7777e4a99f2cd924f3755efc1dc754f4c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:56 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:04,338 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:05,061 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:05,065 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml +[7850374a3496] 2023-07-13 14:11:05,100 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.17', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.17.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.17.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:09 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..377cb9d90d0543abe41666927cbc986e6e73dda4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.17 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.17.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.17.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f1ba54411e1845995b8d136ad7bc54a0e8c2ffd2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:503b8c916b78f4942fd868b9337455d0a4593217bf37efc50c5e8192e7949a22 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1283874015b52135054333f7374b4f78294ecb78 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:242283537138ddfc5699bdd17945a6d6bf4a95ff1d368666989414d0b47ca626 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..9eed91e9162efda276e2163589a8fcec19a7f60c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/speech_shape @@ -0,0 +1,43 @@ +20396 101120 +20397 145920 +20399 184998 +204 172800 +20402 146688 +20406 152832 +20408 90112 +20410 104448 +20413 85248 +20422 138752 +20427 174848 +20435 162816 +20438 158208 +20440 123648 +20442 184715 +20445 159232 +20447 106496 +20461 272896 +20464 187904 +20465 173056 +20482 133376 +20484 178944 +20488 192768 +20489 157952 +20495 154880 +20496 141056 +205 124672 +20503 169216 +20504 202496 +20512 149248 +20513 167936 +20516 112128 +20517 131328 +20520 216320 +20523 153088 +20524 145152 +20532 243200 +20535 122880 +20547 169472 +20549 176640 +20554 97792 +20555 127100 +20558 230912 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f73ae6ea570faab3f441220f454243ad6b5712bd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/train/text_shape @@ -0,0 +1,43 @@ +20396 44 +20397 69 +20399 103 +204 84 +20402 82 +20406 77 +20408 39 +20410 53 +20413 49 +20422 77 +20427 97 +20435 101 +20438 97 +20440 56 +20442 108 +20445 89 +20447 58 +20461 156 +20464 94 +20465 87 +20482 62 +20484 86 +20488 115 +20489 86 +20495 71 +20496 84 +205 71 +20503 97 +20504 97 +20512 86 +20513 75 +20516 48 +20517 56 +20520 113 +20523 67 +20524 72 +20532 124 +20535 57 +20547 73 +20549 75 +20554 48 +20555 72 +20558 114 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..810a6dd0a9b07b19e8d2677dfab6fd2882a681de --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4858d46450d0e9367c15edcbd72c365a9442642406e1a45bff015624839649fb +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c5093d4a82fe2bc9f58ef14c7a6712c634fe611d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a13ce0f0b5aa04d4bd0070b5a32a22dacc5962ac924b3aee213b34a10998c12 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..beb9795a9b67048169a2c8cf84aeab32ae9eb6d9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/speech_shape @@ -0,0 +1,2 @@ +20265 127232 +20613 128000 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..3a13426336be10fbfb62be5d788effa53fa7cd6f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.17/valid/text_shape @@ -0,0 +1,2 @@ +20265 62 +20613 60 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18.log b/exp/tts_stats_raw_phn_none/logdir/stats.18.log new file mode 100644 index 0000000000000000000000000000000000000000..938f1befedc95facf118b5240255bebfe9cc488d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.18.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.18.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.18 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:57 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.18.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.18.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.18 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:04,537 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:05,256 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:05,259 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:05,259 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:05,259 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:05,259 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.18/config.yaml +[7850374a3496] 2023-07-13 14:11:05,282 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.18', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.18.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.18.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:09 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.18/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be2fe86934132bfa2fec9ebe9df3f4aea1d3d7ad --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.18 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.18.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.18.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8795a381dd58c5a7b34282d57cc55dd4a3775d76 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52f599ba68bafeeb4a8e2092854acf288f561184f0f5a5555a6abbfd0fa4984a +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..29b85f42369ffaa90b6a6a3b50ec5ae8d8ed6f7d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1ebecf780f8e2e69c5e17a56b8af606a871c98073aea0ff1350727caae2f368 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..cdbc3351f207bf92a0ee8c07da44539446778af4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/speech_shape @@ -0,0 +1,43 @@ +20561 246272 +20575 118784 +20576 142592 +20582 173056 +20584 170752 +20587 176384 +20589 164864 +20597 161792 +206 169472 +20604 192000 +20605 161280 +20606 153600 +20607 131072 +20616 141568 +20617 170752 +20621 168704 +20622 169984 +20629 164096 +20633 119552 +20635 141312 +20643 150528 +20644 158720 +20645 146944 +20663 215296 +20665 101376 +20668 179200 +20673 140288 +20674 123904 +20675 126464 +20679 157184 +20698 146176 +20699 156672 +207 188672 +20702 198144 +20708 192256 +20709 163840 +20710 169984 +20714 189696 +20729 136448 +20739 119040 +20740 171776 +20751 106496 +20754 185856 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..2b86d0e800d2ead255d157564f715c1f9de053ab --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/train/text_shape @@ -0,0 +1,43 @@ +20561 140 +20575 44 +20576 69 +20582 96 +20584 88 +20587 79 +20589 83 +20597 87 +206 90 +20604 103 +20605 88 +20606 86 +20607 77 +20616 68 +20617 89 +20621 96 +20622 89 +20629 99 +20633 49 +20635 66 +20643 90 +20644 92 +20645 72 +20663 130 +20665 45 +20668 93 +20673 69 +20674 72 +20675 73 +20679 86 +20698 74 +20699 84 +207 98 +20702 120 +20708 101 +20709 78 +20710 93 +20714 97 +20729 80 +20739 56 +20740 98 +20751 48 +20754 116 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..472369ae3c7b44e4890c99fa26a0dc20ce5445da --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ebb7e26b5f1de7e3dea14e3b4c9f7cb4a63e62ce9c63f6cd2b8d857575ba7e9 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c44efc1b0e073b3956c0d096f1cb3aa4e133b62e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ff769eae81ea75bea863efb91347fe250cf221f44ed363ce87dc7b71c22d02 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..1ffe0b6a38e8888ab869b12e188ef3c45e081800 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/speech_shape @@ -0,0 +1,2 @@ +20642 164352 +20701 131072 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..e627750981b518c98647d0dbcdddd6461c59e34f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.18/valid/text_shape @@ -0,0 +1,2 @@ +20642 78 +20701 64 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19.log b/exp/tts_stats_raw_phn_none/logdir/stats.19.log new file mode 100644 index 0000000000000000000000000000000000000000..ec89d0de7e2626c6a141af03d3543b312ad37210 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.19.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.19.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.19 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:09 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.19.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.19.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.19 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:17,332 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:18,056 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:18,059 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:18,059 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:18,059 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:18,060 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.19/config.yaml +[7850374a3496] 2023-07-13 14:11:18,086 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.19', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.19.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.19.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:22 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.19/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cdd3bf70a111e33fa3be6dc5cfac3d2ba1a4ff2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.19 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.19.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.19.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e2c00b66baf5605fa5b29b56441ee487056f58ed --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d909ca2b1016b9e2de5f7519f6e7f888bcd43ad3a90fa94e82bd0e9bfa0d9c6e +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ccfc1336e07d5d49b2b001bf98da1e17fbe6f7e1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e7a5f420434cf83646599a2be6ec8e9b938ddb193352477e19cdeebd1c508d +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..c6998193d16ecdf8bfdf7740047bd642b9b15158 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/speech_shape @@ -0,0 +1,43 @@ +20755 140544 +208 235776 +209 127744 +20913 151040 +20914 249344 +20923 160000 +20926 179200 +20928 190464 +20935 124928 +20954 287744 +20955 193792 +20959 159232 +20961 192256 +20982 189696 +20988 114688 +20992 158720 +21002 201984 +21005 273152 +21017 197376 +21020 150272 +21022 200704 +21028 141568 +21030 172544 +21033 156160 +21038 102400 +21043 175616 +21045 93952 +21055 187648 +21061 178944 +21072 152320 +21075 166656 +21082 130304 +21084 162304 +21085 154368 +21088 207104 +21097 187392 +21098 152064 +211 152320 +21100 204032 +21101 217600 +21103 156160 +21107 138240 +21109 171264 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..d7f4ef26bfe307e8e8f62cf0a42a7fc3e9059d54 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/train/text_shape @@ -0,0 +1,43 @@ +20755 80 +208 112 +209 74 +20913 76 +20914 130 +20923 74 +20926 106 +20928 77 +20935 60 +20954 145 +20955 104 +20959 85 +20961 83 +20982 100 +20988 67 +20992 87 +21002 120 +21005 148 +21017 105 +21020 68 +21022 123 +21028 65 +21030 88 +21033 79 +21038 54 +21043 100 +21045 48 +21055 111 +21061 96 +21072 81 +21075 86 +21082 72 +21084 73 +21085 76 +21088 103 +21097 96 +21098 82 +211 70 +21100 126 +21101 145 +21103 78 +21107 74 +21109 85 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8e0ce1ba4d5621c10f02a9b3a2f1f664b170a640 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14528e9d38ea5668f31cedba1d1e1ca77154172b93f9dffb7553a7614c6aea6b +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b096608d68815ba650c894394a6ccc57f5ce287b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5babfad4af8072cc7cab50a13b57b3bc8d7233c55e493467dccb9352729b8e +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..182e23fbd82f131e4de191834c49a831807b9977 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/speech_shape @@ -0,0 +1,2 @@ +21 159232 +21440 155648 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..14b6b047f5a2889aa84196b1b3311c6255bffdde --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.19/valid/text_shape @@ -0,0 +1,2 @@ +21 82 +21440 74 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2.log b/exp/tts_stats_raw_phn_none/logdir/stats.2.log new file mode 100644 index 0000000000000000000000000000000000000000..8e9a8f6c8265cd44f18e856820faa42efe6f4399 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.2 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:11 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.2.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.2.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.2 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:09:21,968 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:09:22,770 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:09:22,774 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.2/config.yaml +[7850374a3496] 2023-07-13 14:09:22,801 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.2', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.2.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.2.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=16 threads=1 +# Ended (code 0) at Thu Jul 13 14:09:27 UTC 2023, elapsed time 16 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.2/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42a4371db4655eb09a8d34ba5d85eb9f6b886565 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.2 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.2.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.2.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4f9381e93e927cdddf45dea7222ccee84cc3b58f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3fe77c4dc2ed1c1e341f8b4546ff57e5224af61713fd327a08bcf86e3a83ef +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..dfed8523e211b11e506110057357d5f69ddfd66c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743f3810867689e312d613b184c0e018bdacb7a8e2627200d862138356aa223d +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..12a1b41900d1aba9c026075f2807df8a68a21a86 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/speech_shape @@ -0,0 +1,44 @@ +14626 178432 +14630 170240 +14631 216320 +14632 282880 +14636 140288 +14643 97792 +147 88064 +14731 255232 +14732 157440 +14733 112896 +14736 131584 +14737 178688 +14739 207872 +14757 292096 +14761 113408 +14763 177408 +14764 191744 +14780 159232 +14781 178176 +14783 182272 +14784 171264 +14786 187904 +14799 185344 +148 171776 +14802 253952 +14813 216576 +14818 191488 +14820 159744 +14822 183040 +14840 229506 +14841 137472 +14845 228608 +14856 261632 +14858 217600 +14865 139520 +14876 166912 +14878 188672 +14879 174080 +14880 295936 +14881 187648 +14882 217088 +14883 186880 +14890 179712 +14892 278272 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..83acb74c2d9b50bf1d9ae01fa3344d7b74529638 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/train/text_shape @@ -0,0 +1,44 @@ +14626 82 +14630 79 +14631 104 +14632 133 +14636 71 +14643 47 +147 48 +14731 121 +14732 76 +14733 64 +14736 57 +14737 95 +14739 102 +14757 134 +14761 54 +14763 105 +14764 103 +14780 89 +14781 98 +14783 88 +14784 84 +14786 101 +14799 102 +148 83 +14802 137 +14813 97 +14818 112 +14820 79 +14822 105 +14840 130 +14841 81 +14845 142 +14856 132 +14858 107 +14865 64 +14876 81 +14878 96 +14879 96 +14880 156 +14881 110 +14882 132 +14883 97 +14890 100 +14892 178 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..5950c8360ffd3491ec70b7613bc2676f0ee30138 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e06b8e0ccdd494295d16549550b75be2a3640f791294be8e279eabcf3e6db03 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c3ba26fd25384f803766348c1cfe57375a1f027b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1538a8d070f9c0917118cae33e4cd40c8dfe2d87e1bab387c821943dd42b4409 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..75b2ffc0782e28f4cb646812810c2628c6fdf7e0 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/speech_shape @@ -0,0 +1,2 @@ +123 181760 +125 254720 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..56a03d389fdbc906265b14492c7a3486898aab2f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.2/valid/text_shape @@ -0,0 +1,2 @@ +123 96 +125 117 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20.log b/exp/tts_stats_raw_phn_none/logdir/stats.20.log new file mode 100644 index 0000000000000000000000000000000000000000..0a3c02630077594b276dc27ed12ae6fc2a519ce7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.20.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.20.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.20 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:09 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.20.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.20.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.20 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:17,397 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:18,141 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:18,144 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:18,144 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:18,144 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:18,144 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.20/config.yaml +[7850374a3496] 2023-07-13 14:11:18,169 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.20', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.20.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.20.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:22 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.20/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..59a6195302e1f76cf653ffbcb8572a2fb4952acf --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.20 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.20.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.20.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4ead3c053a6b8b6a273913eefc42b4c510a9e801 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ffdc918ea99bea99aadf775dfd9888232762d2389cf47103c3dba30d97631d6 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1238b84535363ad50843a52d425e30c9d67843e6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f10d47f55837d802604b37715be25db607a0b38931feda104c473286fedf071 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..08bfaffb3a22552f7987d8617125b22991ce6aa4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/speech_shape @@ -0,0 +1,43 @@ +21113 141312 +21115 124928 +21129 194816 +21139 219392 +21143 148736 +21144 197888 +21147 139520 +21148 147200 +21158 156160 +21165 200448 +21170 154880 +21175 182528 +21176 139264 +21178 123904 +21192 178176 +21193 121088 +21199 142592 +212 126976 +21201 158208 +21210 144896 +21214 154624 +21228 154624 +21235 212224 +21239 207616 +21248 198656 +21252 178432 +21266 185344 +21269 163328 +21270 161536 +21271 159488 +21282 216320 +21287 149760 +21290 114944 +213 157952 +21307 129024 +21308 159744 +21309 170752 +21310 182528 +21312 175104 +21348 143616 +21349 126208 +21362 223232 +21363 226048 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..953a252aee13c6385df9438f350d8647d8f6ef00 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/train/text_shape @@ -0,0 +1,43 @@ +21113 72 +21115 63 +21129 98 +21139 100 +21143 85 +21144 93 +21147 74 +21148 65 +21158 76 +21165 110 +21170 83 +21175 92 +21176 62 +21178 66 +21192 93 +21193 63 +21199 65 +212 68 +21201 89 +21210 87 +21214 89 +21228 83 +21235 118 +21239 121 +21248 90 +21252 115 +21266 95 +21269 103 +21270 88 +21271 90 +21282 129 +21287 63 +21290 50 +213 82 +21307 82 +21308 101 +21309 97 +21310 105 +21312 93 +21348 74 +21349 56 +21362 114 +21363 124 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..927b8257b683d094360f49a9dc760da5d6172d2c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:582736fdc58edadd369c0a20bfc69e0842fc4d8b4d70da071f22c9d15c6fb198 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..0acbbc86bd24b42071e12d58c68d171838248960 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1c18db9892df647ab461160d4ae2003af7bf90a8cfcd278d5e11df22274780 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..00d32b4431235f80ca7cb9c4b0f5ac73a8c4934c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/speech_shape @@ -0,0 +1,2 @@ +21499 114944 +21601 180224 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..edc040a028be21696a62da02d35a05783131aa9e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.20/valid/text_shape @@ -0,0 +1,2 @@ +21499 54 +21601 83 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21.log b/exp/tts_stats_raw_phn_none/logdir/stats.21.log new file mode 100644 index 0000000000000000000000000000000000000000..0e565fb2596ea1ddde112be0a6b6c236fa897751 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.21.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.21.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.21 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:22 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.21.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.21.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.21 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:29,834 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:30,576 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:30,579 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:30,579 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:30,579 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:30,580 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.21/config.yaml +[7850374a3496] 2023-07-13 14:11:30,602 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.21', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.21.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.21.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:34 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.21/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a712cf1ca01b9f9f8a125d82097f011f8d44596a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.21 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.21.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.21.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..43c0e3ba0933f79eb4b9ba475f4f71eba5e3b128 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ec2c44202275dd1a0871a093640a9e0326482464cbd822528225252ffcb90f +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..261122ade8be03afd16b0b1f9185233a3bc3d79f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f4349fa7fc59478d99ddb99522e46d5935b7f1ee85fb1ca20cdabf35d16e6a +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..892fef81f1c12196c9ca42e3e8a051c8cac61311 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/speech_shape @@ -0,0 +1,43 @@ +21366 185088 +21368 133632 +21372 193792 +21375 141312 +21379 148992 +21382 143104 +21385 175616 +21388 133632 +21397 121600 +214 112640 +21401 217600 +21402 162304 +21403 138240 +21411 150272 +21412 185856 +21414 187904 +21420 177076 +21422 159232 +21424 177920 +21425 131840 +21433 194048 +21447 176384 +21449 159232 +21475 168192 +21479 169216 +21481 91136 +21492 144640 +21498 145152 +21500 178432 +21502 176384 +21503 204288 +21505 161024 +21511 140800 +21515 246528 +21517 182784 +21521 139776 +21525 198400 +21526 122368 +21536 110592 +21539 181248 +21540 103936 +21544 144640 +21546 135424 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..d35be30ddba42fb3e4566f40f836bb4b41a9dd40 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/train/text_shape @@ -0,0 +1,43 @@ +21366 85 +21368 40 +21372 64 +21375 64 +21379 76 +21382 74 +21385 82 +21388 74 +21397 62 +214 52 +21401 121 +21402 89 +21403 68 +21411 70 +21412 97 +21414 97 +21420 84 +21422 69 +21424 106 +21425 77 +21433 104 +21447 86 +21449 91 +21475 97 +21479 101 +21481 39 +21492 73 +21498 79 +21500 82 +21502 90 +21503 100 +21505 83 +21511 58 +21515 127 +21517 82 +21521 74 +21525 107 +21526 61 +21536 54 +21539 96 +21540 45 +21544 75 +21546 46 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8577b2bd9f5e9ccd6ac9659ec9a2fb6c9526d4c6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d48be59a85e4a9955cc1203fdd9af2a4c054aca596e325be29b98f771d01f55d +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..626a191695b664050c9176204b9cd6d2702df3b7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56bdb3a1ef953226cd398a880958729301ddac6ae7cd39e5d8d6f4c968786861 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4c212a4b21bb6f190dda3b7534e71bb8f1babc09 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/speech_shape @@ -0,0 +1,2 @@ +280 184832 +286 92672 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..67ed900986d4c099ecdc6c91f57d55dafce06d30 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.21/valid/text_shape @@ -0,0 +1,2 @@ +280 103 +286 48 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22.log b/exp/tts_stats_raw_phn_none/logdir/stats.22.log new file mode 100644 index 0000000000000000000000000000000000000000..ec8720d728a7405341e65e7d504343c0d1bfa40f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.22.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.22.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.22 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:22 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.22.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.22.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.22 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:29,803 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:30,531 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:30,534 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:30,534 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:30,534 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:30,534 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.22/config.yaml +[7850374a3496] 2023-07-13 14:11:30,556 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.22', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.22.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.22.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:34 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.22/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98468c28edb7dfcd2db01f10bcd668d48f515801 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.22 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.22.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.22.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1de0ea7208f2e145e7570d11116633089a303ce1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1871803616dc9ed14c1886c54f848f261982cde03b0f3459cc14097c278f87 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8d699363e0a41ce501e215f2c51e568725ef6dbd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c75b7cea254a1069d3942f05f8cdd57a15c10086fd542090465113c1cf3b315a +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..01c0b0d9fc2f4e83b51e2192bf792fa04e382f8c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/speech_shape @@ -0,0 +1,43 @@ +21547 128000 +21560 148992 +21567 214016 +21569 124160 +21571 101632 +21575 159232 +21584 103680 +21590 137728 +21591 188160 +21592 148480 +21596 135424 +21597 84480 +21598 126464 +21604 147712 +21605 170240 +21609 121600 +21623 173568 +21624 192256 +21675 184320 +21726 114432 +21740 138496 +21744 138752 +218 152576 +219 92672 +22 86272 +220 132608 +222 171264 +223 130048 +227 129280 +229 115712 +231 162048 +233 187392 +234 147968 +235 129024 +236 127744 +238 223488 +240 142848 +241 139520 +242 107520 +243 128256 +244 127232 +246 161792 +249 152576 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..c50502ef3a27ebab0c5f6f28b73c761bb20964b4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/train/text_shape @@ -0,0 +1,43 @@ +21547 58 +21560 65 +21567 122 +21569 68 +21571 55 +21575 77 +21584 60 +21590 66 +21591 109 +21592 67 +21596 60 +21597 39 +21598 49 +21604 74 +21605 84 +21609 49 +21623 78 +21624 100 +21675 91 +21726 61 +21740 67 +21744 65 +218 84 +219 51 +22 49 +220 72 +222 85 +223 66 +227 68 +229 57 +231 72 +233 94 +234 82 +235 70 +236 66 +238 122 +240 87 +241 70 +242 58 +243 56 +244 64 +246 82 +249 81 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..12e1880911ee46ed26a87d83f0933d7eb101456b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b97af6c566c9258b2ab658665268f3d3407a61d9b4e7043ace081b0459650c7d +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..98ee8486cea902707569acf15d91b329a681deeb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd10d518f0d731ed8557e91cd80d4f62e85b8328e2fd6c0317e29401beb5d151 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3078a91fba60a189f9189ff18aeccd6ee8dfe751 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/speech_shape @@ -0,0 +1,2 @@ +287 146944 +296 109824 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..24b4753a0f4bbcbfd78d0db46f7d55115cc52c54 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.22/valid/text_shape @@ -0,0 +1,2 @@ +287 72 +296 56 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.23/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f33de0faf2411ad4e0ee985e027225e4d2025bf --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.23 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.23.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.23.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bb46c580a98980bde9ac8297b5a231ec880ed277 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a94bff3709c416046762bf3e042c8477ee826e9a87aa960bce060145709b5a +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..da9902aab72e9d863bd3d3520c6be9de5b17da38 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2734e38feaab9f6f97c7a32d6eb163dcb2385dd6e1b1b47e7cf22f8162e15b3e +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..8107cc8734a7492b4db43e6b5d8c4989fda99d3e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/speech_shape @@ -0,0 +1,43 @@ +250 121344 +251 135424 +253 174961 +254 102144 +257 132608 +258 124672 +259 163840 +260 139520 +261 186624 +264 122368 +265 149248 +266 196608 +268 136192 +269 129280 +271 190720 +272 155392 +274 150528 +275 127232 +277 182528 +278 150784 +281 183808 +282 179968 +283 108800 +284 171520 +288 147968 +289 194304 +290 175360 +293 119296 +294 137728 +295 127232 +297 122368 +299 124160 +3 109568 +300 137984 +301 214784 +303 130890 +304 142592 +305 98816 +307 141824 +308 96256 +309 103168 +310 141568 +311 113152 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..d84b6825ae9db77281c6c9c2aa1afbe414909a1d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/train/text_shape @@ -0,0 +1,43 @@ +250 62 +251 68 +253 97 +254 50 +257 69 +258 65 +259 86 +260 70 +261 99 +264 64 +265 82 +266 105 +268 73 +269 65 +271 108 +272 81 +274 81 +275 65 +277 96 +278 72 +281 100 +282 92 +283 57 +284 92 +288 83 +289 109 +290 88 +293 61 +294 60 +295 70 +297 60 +299 70 +3 49 +300 66 +301 122 +303 65 +304 71 +305 54 +307 80 +308 48 +309 54 +310 72 +311 54 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4b64ed2d670ec86978fa1f50e538eca080f6c504 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c3a4fa25d9c8b6551d6a8fb0e0bf6a81c5f2c0587bea4dbf8e4b8fb1ef3808 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bef0e9294f6d4c6b7a2ffa09a1513c05cf413d6b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c72a7c3576fbf5ec5a144420c9c35093ad06c450792a0973240e402e60c62f9 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..5f85236a74128cf9905236c85249c6a0299ae6a4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/speech_shape @@ -0,0 +1,2 @@ +409 143616 +458 153856 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..4205d803d43088e7224dd4df4db410d607ec1ab0 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.23/valid/text_shape @@ -0,0 +1,2 @@ +409 79 +458 72 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.24/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..802ca6357dce494c624286f39810ffc8e74e80f6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.24 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.24.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.24.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..cf26a4252244eae87335e60381e47833f36a2c66 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31f5c162a336add70e0ed6890a4fa3ad5faa78117eda3279fe42133f8e6f5c06 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..670849be5c7009067e59b9e375e880a87874a8ce --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c96d8c2cdd154e17f1c6d882a6fda141c6f0331ce6797c5aa8e42b2d480a60 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..622621c03d3e23a717a11f2d2cbf995f9d499117 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/speech_shape @@ -0,0 +1,43 @@ +312 97536 +313 118272 +314 121600 +315 76288 +317 103680 +318 192512 +32 179456 +320 138496 +322 128512 +323 98560 +325 92672 +328 181248 +33 104192 +331 155136 +333 104192 +336 106240 +337 90368 +338 119040 +339 134144 +34 130048 +340 152320 +341 146688 +342 170752 +345 151552 +346 171008 +348 155904 +35 98816 +351 189696 +354 119296 +356 117248 +357 128000 +358 109824 +36 119296 +360 154368 +361 93184 +362 145664 +363 104448 +364 107264 +365 171520 +366 170240 +367 106752 +37 124416 +370 137216 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..352cdd268e5c6febf2de1cefac16ad96d3351fc3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/train/text_shape @@ -0,0 +1,43 @@ +312 50 +313 66 +314 58 +315 34 +317 52 +318 101 +32 88 +320 72 +322 68 +323 51 +325 41 +328 85 +33 45 +331 88 +333 52 +336 54 +337 54 +338 69 +339 78 +34 62 +340 76 +341 88 +342 92 +345 82 +346 94 +348 70 +35 52 +351 120 +354 60 +356 66 +357 71 +358 58 +36 59 +360 76 +361 54 +362 84 +363 48 +364 63 +365 82 +366 97 +367 65 +37 50 +370 76 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f70eb1bc1abc3bee7e5d363fd6a479e9f0b5b090 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7810cd7103cfa328bc94f1cbaa5431e63dd207752476f4e606f6c1be31893387 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f73bd22319c3ac425e763f3543510b6be52c6672 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dba94a89346218d441eb6ac08be9b2ce8b4167a5032ec2f320ac3290548aaf3 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..d5272cf19f44b216354620b0fbaa4a636deae5a7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/speech_shape @@ -0,0 +1,2 @@ +531 134656 +538 86784 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..9f4c952e5c9aedd5c41bcd25a4632a6051ec48c9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.24/valid/text_shape @@ -0,0 +1,2 @@ +531 68 +538 43 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25.log b/exp/tts_stats_raw_phn_none/logdir/stats.25.log new file mode 100644 index 0000000000000000000000000000000000000000..693abcd1d4cc72a5d793e5a1d6becfd5cf32a56e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.25.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.25.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.25 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:46 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.25.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.25.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.25 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:11:54,495 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:11:55,237 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:11:55,239 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:11:55,240 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:11:55,240 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:11:55,240 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.25/config.yaml +[7850374a3496] 2023-07-13 14:11:55,264 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.25', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.25.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.25.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:11:59 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.25/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7427c6b70f777b21342c8a5d619fa55a0a06639a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.25 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.25.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.25.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..658aedc3aea77e1ab9e75c645bf31faa494cc030 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d39dbe11f10e483138656f07202a0bcb15eddc56934ba8bd271f8218eb82c6 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..fbbe1aca5b4d002345365c02da2650762cccde07 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50dc5201b6c7f8cc89799f05a4da4f47c0bec7538bd1a13275535a1466d72765 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..cb57a4bd312410bd5c0e10a750ffb34f499a1b64 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/speech_shape @@ -0,0 +1,43 @@ +372 126208 +373 199936 +374 185491 +375 111104 +376 116992 +377 108544 +378 177920 +379 142080 +38 162560 +381 148992 +382 141056 +384 130304 +386 175360 +388 113152 +389 152320 +392 229120 +393 160512 +394 109824 +396 102144 +398 145664 +399 184064 +40 133376 +400 158464 +401 131072 +402 180480 +404 114688 +405 123648 +406 118784 +407 137472 +41 107776 +411 122624 +413 133632 +414 106752 +415 123904 +416 139008 +42 125184 +421 122880 +422 104960 +424 165376 +425 203008 +426 147968 +427 99584 +429 162816 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..a1b2fc2db6fb9406ec5b1edae56066e2e4090ba7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/train/text_shape @@ -0,0 +1,43 @@ +372 72 +373 126 +374 110 +375 66 +376 66 +377 69 +378 108 +379 77 +38 73 +381 86 +382 82 +384 69 +386 107 +388 67 +389 67 +392 127 +393 78 +394 57 +396 45 +398 71 +399 89 +40 62 +400 82 +401 65 +402 94 +404 51 +405 54 +406 64 +407 72 +41 53 +411 74 +413 72 +414 48 +415 60 +416 79 +42 64 +421 64 +422 60 +424 86 +425 97 +426 73 +427 51 +429 84 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c6f72e95b0ca017ddd6f6df9367cd8d260c39aad --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a88272bc0d1c0cc634f6c003d424e71589cf6da0530f2d4c8ec066eb4f0fd8ec +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..00dde11b8cd14cea7b1bdd4a3eb667f050c1692b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c823e47ba84afbd5f11ca134244c9064e663bc46c19c69d9085b47e029025496 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..fb26148dc814eabf75bc62e7dd44439c8a665af3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/speech_shape @@ -0,0 +1,2 @@ +539 175104 +540 228608 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..c8944d79eb603366f8e0e63b1f338d2be74c2863 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.25/valid/text_shape @@ -0,0 +1,2 @@ +539 84 +540 112 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.26/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8bd2c4bb9e6ea39b2b9db1dc086b0ed2afa8d45 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.26 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.26.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.26.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f9b0e6ad6ae9c70c40898754c87694359f9f5c60 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dab50dcae3f144e87b73258de98dba2f363a8de83addf3e6c381c0d76b52833 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..fba7774cfd7168f992dcd3053089f87e0015e312 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c0d68b8dd81b3a8d76d7c4b1666f5fbccae5d3de80650902cfbad7518a84211 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..e912b13de7e8508040b667c549d1dcd2e1700e58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/speech_shape @@ -0,0 +1,43 @@ +43 116736 +430 135168 +431 149760 +432 164864 +433 120064 +434 152320 +436 154112 +438 195014 +440 197888 +441 136960 +442 163840 +443 112640 +445 108544 +446 230656 +448 108800 +449 125184 +45 100352 +450 163584 +452 155392 +453 103680 +454 130304 +459 158976 +46 183296 +460 185344 +461 117760 +462 108288 +463 112896 +465 129024 +466 126208 +467 76800 +469 143104 +470 99328 +472 122880 +473 101632 +474 125952 +475 145152 +476 189440 +479 110592 +48 102716 +480 141568 +481 149811 +482 110848 +483 123904 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..0c2c40062b7eb7fdf29d85852518b76ee8510439 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/train/text_shape @@ -0,0 +1,43 @@ +43 61 +430 68 +431 89 +432 89 +433 58 +434 78 +436 82 +438 103 +440 98 +441 72 +442 77 +443 55 +445 52 +446 104 +448 54 +449 69 +45 43 +450 82 +452 84 +453 50 +454 71 +459 82 +46 87 +460 100 +461 53 +462 46 +463 79 +465 79 +466 70 +467 43 +469 68 +470 60 +472 68 +473 50 +474 74 +475 81 +476 96 +479 54 +48 53 +480 85 +481 72 +482 57 +483 50 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..d7c0ba4bd23c30e225876e7f7b48a952922b8b81 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30a4e4fb6b903c660a1b9c1d4bf60e1070ef7e5b3ebfc856dcad515d31b9131 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7b6a70515f401b7f04719ecf14b589cee6879dc5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceaeda48cfe1ee14c9540f2ce4c4006192b0beaa7536927791cd987e8bd0b4a8 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..25533a955539f4919ab6d26f3f0188f86cf68fdb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/speech_shape @@ -0,0 +1,2 @@ +545 121600 +547 169984 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f06b8040af585f02da1e76650cdf699fb88f7d68 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.26/valid/text_shape @@ -0,0 +1,2 @@ +545 57 +547 92 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27.log b/exp/tts_stats_raw_phn_none/logdir/stats.27.log new file mode 100644 index 0000000000000000000000000000000000000000..b91aa7f86f70e3b0263940eef76d2b44b04900e6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.27.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.27.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.27 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:11:59 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.27.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.27.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.27 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:12:06,413 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:12:07,134 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:12:07,137 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:12:07,137 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:12:07,137 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:12:07,138 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.27/config.yaml +[7850374a3496] 2023-07-13 14:12:07,164 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.27', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.27.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.27.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=11 threads=1 +# Ended (code 0) at Thu Jul 13 14:12:10 UTC 2023, elapsed time 11 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.27/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3838b5de8cb011131471044d50baba03f85eab3c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.27 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.27.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.27.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..888a40bc5fcb8d8b5ccb833f321d8a89baf107c1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08bba25beb37c78aa9ddf4beb64e5cf2cadf95d1c0fd3c95a34ffec40540aae3 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..933d6a984e426a34c220e43b826860b449650a9d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f21ae9acffaf6e3ea1d6e7e23d01b4b06e3ec41a03d162a0100380311058b3a +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..10655d52fef1ca0be1ca0cb65cf39e359d4ba0c5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/speech_shape @@ -0,0 +1,43 @@ +484 132096 +485 112384 +487 124928 +488 95744 +489 156416 +49 84736 +490 97984 +492 102144 +493 108544 +495 136448 +496 118272 +497 128256 +499 120064 +5 148480 +50 103168 +500 158720 +501 98304 +502 83200 +503 74752 +505 97792 +506 143872 +507 128768 +509 147456 +51 110848 +510 136192 +513 118272 +515 130048 +518 161024 +519 126464 +52 144128 +522 123648 +523 75264 +524 146688 +525 135680 +528 182016 +530 135424 +534 90624 +535 123392 +536 87552 +537 124672 +552 166400 +557 220928 +558 126464 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..3f7b6b738a5079d2fce1c3cc7e023922f8d0f16d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/train/text_shape @@ -0,0 +1,43 @@ +484 76 +485 64 +487 69 +488 50 +489 81 +49 36 +490 52 +492 60 +493 66 +495 58 +496 66 +497 56 +499 68 +5 65 +50 47 +500 85 +501 61 +502 46 +503 35 +505 54 +506 86 +507 71 +509 82 +51 50 +510 67 +513 57 +515 61 +518 80 +519 60 +52 70 +522 71 +523 46 +524 92 +525 66 +528 93 +530 66 +534 47 +535 57 +536 55 +537 65 +552 91 +557 103 +558 67 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..43ee6c0d94d7792e9597fd8b4c95773a751333f1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5218686bbdeb412716184513852380ba1a04c650e8f16a08cf2408dfe8a1842 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..1fe7301d3bb95a60bc6c45df9ef3bfc0f16c8187 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409d200c3ddea52656ed29f308e906f3c366ef70621114e67478a452172a050a +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..70be9faab7c87c367ba2ad0b1b0494c505dbbc80 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/speech_shape @@ -0,0 +1,2 @@ +549 102144 +551 164608 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..31e7fb94ecf8983d27ba63cbf5d86ee78a0b9789 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.27/valid/text_shape @@ -0,0 +1,2 @@ +549 51 +551 89 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.28/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dac606d64fe5902eaf4e253a917414c4f5930502 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.28 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.28.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.28.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..d898a9ab14c2d3a2fb943b32bf4b3688f6cf98cc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24537b44df82c0ec674afd00e24d3bd0035293514511a8d6de09d196f7575c90 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..9e480ef21d4f7f7f84d5149ef7066d534f20ac3f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e71ed4c7e74be616badbc85a5831f56dd09aec4af4127b9dd36572b1499caef4 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..fb920192121f64a4c40af0640844138a3a2da062 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/speech_shape @@ -0,0 +1,43 @@ +561 152576 +562 170240 +564 99584 +566 116480 +567 155648 +569 154112 +570 160512 +571 131840 +572 134656 +574 103680 +577 155904 +578 153600 +579 121088 +58 78592 +580 127744 +583 101120 +584 84992 +585 95744 +586 125952 +587 127488 +590 104960 +591 159488 +592 163328 +593 93696 +595 111872 +596 136704 +598 304384 +599 235776 +6 184064 +60 156416 +601 145920 +602 115456 +603 163328 +604 124160 +605 162560 +606 182528 +607 109056 +608 96768 +609 130560 +61 83456 +610 152576 +611 196352 +613 120320 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..b4727fec0f1d8662a7882c8b504afbf97827b894 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/train/text_shape @@ -0,0 +1,43 @@ +561 78 +562 98 +564 50 +566 56 +567 82 +569 68 +570 77 +571 64 +572 67 +574 53 +577 73 +578 75 +579 64 +58 47 +580 72 +583 65 +584 52 +585 55 +586 65 +587 72 +590 44 +591 84 +592 78 +593 49 +595 54 +596 77 +598 139 +599 132 +6 98 +60 89 +601 74 +602 56 +603 97 +604 65 +605 88 +606 101 +607 61 +608 53 +609 67 +61 36 +610 100 +611 125 +613 64 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..2cd782d92c2a25cf7fd58aed351489c9893ce780 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5e89cb5f1e737e57b1182c8ecf97b986a2e0e93c638e56cdef70cdf6956f494 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..cb37f9baaca118752eff4a8c526bf30b77411181 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd01654ec788d0a004dc6afc79104a862e111fb822729849236d4f055d753949 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..e352e075915df7f79a5f58d026eb8b9bfafbf723 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/speech_shape @@ -0,0 +1,2 @@ +554 107776 +559 238336 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..961cc64bd6c9e6105a3db30fec61a7667e84803b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.28/valid/text_shape @@ -0,0 +1,2 @@ +554 59 +559 132 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29.log b/exp/tts_stats_raw_phn_none/logdir/stats.29.log new file mode 100644 index 0000000000000000000000000000000000000000..9865bcefdf43533ac8e9c2c465a838e290d2486d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.29.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.29.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.29 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:12:10 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.29.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.29.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.29 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:12:19,043 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:12:19,783 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:12:19,786 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:12:19,787 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:12:19,787 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:12:19,787 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.29/config.yaml +[7850374a3496] 2023-07-13 14:12:19,816 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.29', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.29.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.29.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:12:23 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.29/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a757a59cc3e32f581dcc98e0e773b2237382867c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.29 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.29.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.29.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..934ae20873a50504179aa93d7c62c595553d65d7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8612d83a24269fa08bfd35fd90dc72ab321073149cffd7d217bc427c016c8c40 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bf4389506b6a902ff382ad723e70897b5158a2bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925cca7a4ecb1981229e31bf8520e3705caf5b8a03bfbf39f75f2038ac0b1dd7 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..73448a8c05d49a8f0109aa47fa8e25d0ec76c42d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/speech_shape @@ -0,0 +1,43 @@ +614 166656 +617 216064 +619 133632 +62 68608 +620 109824 +621 99840 +622 168704 +623 113920 +624 139264 +625 166912 +626 209920 +627 139008 +629 263936 +63 150784 +630 91136 +631 120320 +632 123392 +633 121344 +636 141056 +639 148480 +640 166400 +641 117248 +643 189519 +644 131201 +645 107776 +646 128512 +647 120832 +648 108544 +649 278016 +65 85504 +650 128768 +651 170752 +655 132864 +656 122880 +658 110152 +659 111872 +660 123136 +661 123392 +665 112384 +666 128220 +667 150784 +668 153856 +669 99584 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..9bd7b8c486d22dc80e9bd830ca85d14bd5037f25 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/train/text_shape @@ -0,0 +1,43 @@ +614 95 +617 108 +619 59 +62 38 +620 55 +621 53 +622 87 +623 50 +624 75 +625 86 +626 102 +627 64 +629 156 +63 81 +630 53 +631 62 +632 64 +633 62 +636 74 +639 82 +640 93 +641 60 +643 117 +644 64 +645 50 +646 66 +647 60 +648 52 +649 138 +65 38 +650 58 +651 91 +655 86 +656 71 +658 56 +659 62 +660 63 +661 67 +665 48 +666 59 +667 79 +668 93 +669 46 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a0f9ed58b46043af27182255de7e0dcacf2c09fd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5b0476419af5308fc418026421d57e7e50d84533b69773776b63fc7a3a5566 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..95c7e0d013cfbf38170248178e1967d9252c68e2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a2861e0cd488e8008cf30eaf34ddbc6471196e22ad56a7021c5a5db43ee39c +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..0f788d5a299d82ee4eede619d43c0171a2aea716 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/speech_shape @@ -0,0 +1,2 @@ +560 162816 +588 187904 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f18a6c3b3a7729d3b57bccda03f003fbaf73fe83 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.29/valid/text_shape @@ -0,0 +1,2 @@ +560 80 +588 101 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.3.log b/exp/tts_stats_raw_phn_none/logdir/stats.3.log new file mode 100644 index 0000000000000000000000000000000000000000..44871558f74d5c4108a3b0ff07db68623b153f04 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.3.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.3 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:27 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.3.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.3.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.3 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:09:35,973 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:09:36,687 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:09:36,690 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:09:36,690 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:09:36,690 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:09:36,690 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.3/config.yaml +[7850374a3496] 2023-07-13 14:09:36,717 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.3', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.3.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.3.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=14 threads=1 +# Ended (code 0) at Thu Jul 13 14:09:41 UTC 2023, elapsed time 14 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.30/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94c5d7bf5deb2d9f249e1e8e78f5ec9aa45f6a1d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.30 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.30.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.30.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..6424dbb550fa1f59882a68631c6b101340027ec1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b51cd0fc3cef092d547143b5e3d3dafec1869e36250d690781bbe787c8032eb +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..d9720c9a6133fa0ed3aa02a2f432f48c401dd98d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9bdd661820215b3ece20f01b6881051201e6805265da3a62ee37c062b123e5 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..6c717f679df3f438a405fa390d3d05b3154774ef --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/speech_shape @@ -0,0 +1,43 @@ +670 102400 +671 144896 +674 107934 +675 88320 +676 201216 +677 138496 +679 290304 +68 100352 +680 126976 +681 160000 +684 210322 +685 214016 +686 147200 +688 137472 +69 155646 +690 203264 +691 105216 +693 146944 +694 154880 +696 105728 +697 221184 +699 102656 +70 93184 +701 231680 +702 124416 +706 136704 +707 189952 +708 242432 +71 129024 +711 87120 +712 94976 +713 125952 +714 102400 +715 157696 +716 165888 +717 97024 +718 145408 +719 218624 +72 77824 +720 129792 +721 153600 +723 199424 +724 122624 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..6d61957851d9d7f6638bfe1b39fa8ac173966927 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/train/text_shape @@ -0,0 +1,43 @@ +670 56 +671 76 +674 50 +675 42 +676 103 +677 77 +679 179 +68 52 +680 75 +681 90 +684 118 +685 100 +686 86 +688 70 +69 83 +690 97 +691 52 +693 82 +694 83 +696 61 +697 112 +699 57 +70 55 +701 131 +702 70 +706 71 +707 106 +708 144 +71 74 +711 48 +712 50 +713 64 +714 48 +715 85 +716 93 +717 48 +718 69 +719 99 +72 37 +720 78 +721 82 +723 101 +724 59 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..04b5410901878baa0a3f51a5c8dc550a4b1fbf6a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1811823545dfeead621246df2ab4f6ff86b2846a7f3f9fbcbb281101e9bcc0 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..0880bba2ee6278b3afadc78969a22d814807283c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a7ae87e2db3f3e05f3230bcc0610354ec35c4e0a97b522c0da18d107511a83 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..1fb3d61a1de246a7f8c31b6ad57f5c16ac279c64 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/speech_shape @@ -0,0 +1,2 @@ +672 129280 +673 244224 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..a6b7af0291d3fb8ba78b31f4663581e5742a5b80 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.30/valid/text_shape @@ -0,0 +1,2 @@ +672 52 +673 133 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31.log b/exp/tts_stats_raw_phn_none/logdir/stats.31.log new file mode 100644 index 0000000000000000000000000000000000000000..4cc0da27c82d853f8d5651357b815358238e5719 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.31.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.31.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.31 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:12:23 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.31.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.31.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.31 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:12:31,317 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:12:31,991 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:12:31,994 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:12:31,994 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:12:31,994 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:12:31,995 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.31/config.yaml +[7850374a3496] 2023-07-13 14:12:32,021 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.31', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.31.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.31.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=12 threads=1 +# Ended (code 0) at Thu Jul 13 14:12:35 UTC 2023, elapsed time 12 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.31/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d60bc94f90dc6024282bffc26190ed6cb561bf63 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.31 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.31.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.31.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f445453f4f4410c1068a827770c03f83316dc3cd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8dca0bc43ccbfeaa4e3473f794dfc5d3286a8316978844966283dfa0774c2a8 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..559f2af137b8278d4bf19057f4475897e5f5a928 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70fa04cf349a05373f6bded745a8233cdf65af178d3c49f57391616cf0b2b2c7 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..9496a57032cd3857206d4a0108d7c84c5a7b9342 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/speech_shape @@ -0,0 +1,43 @@ +725 102400 +726 185600 +727 144128 +728 163072 +729 104192 +730 130816 +731 142336 +732 117504 +733 168448 +735 151040 +736 105728 +74 126208 +740 183040 +741 162048 +744 190720 +746 148736 +747 119040 +749 137984 +75 178688 +750 131328 +751 129024 +752 191488 +753 135424 +754 141568 +755 146176 +756 113152 +757 180992 +758 147200 +759 120832 +76 168960 +761 155904 +762 92160 +763 199680 +765 89600 +766 225280 +767 125368 +768 195072 +769 167936 +77 158720 +771 152064 +772 115200 +773 120064 +774 162304 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..3e98552cb8eab64c81cd0d26c68e614db82531ea --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/train/text_shape @@ -0,0 +1,43 @@ +725 50 +726 95 +727 76 +728 87 +729 46 +730 72 +731 72 +732 60 +733 88 +735 84 +736 53 +74 54 +740 92 +741 77 +744 100 +746 70 +747 66 +749 72 +75 94 +750 62 +751 67 +752 100 +753 72 +754 67 +755 60 +756 49 +757 110 +758 76 +759 66 +76 80 +761 78 +762 54 +763 133 +765 46 +766 130 +767 63 +768 105 +769 97 +77 65 +771 85 +772 55 +773 65 +774 90 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8a00d3d064b92f1abfccc7627779368a2191f875 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30865abd8031c657570fb9a8813363368733e7a39747c2bb1b74c762c31e5b6a +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..63d79951f0f788897bfeb193147d1a029162ea8b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0696c5fb407e0c7c9470b739b7e61a87241f9c22512a3856a0a3a67bb79a62 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..92652f89ac5b85adb9f2a0fcebfb911edfc75145 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/speech_shape @@ -0,0 +1,2 @@ +678 148224 +698 236544 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..c4cb048484b649c6f0da163ee7e674cbb5c0b72c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.31/valid/text_shape @@ -0,0 +1,2 @@ +678 70 +698 148 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.32/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..606f48e8ad30149c99ee7f7bff4da0c266a0cc91 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.32 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.32.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.32.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..ca35b2960c4d5053ea70a07c4b433b224c322e76 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f88be6225021ee9d8a890bdfde86a2b4108424b984ed0f0acca2f4c1e500b63 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..9fe11db3e707b8d0d2d59e4a20de330e867bf36e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf2ce7906fbb5e8d1f03eb9dcb2d688e34cb6d7fa5e95ba9aeb7fe496c51a5f9 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..78889040fe4bed7a3a86dbe0d219e43695b1a09f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/speech_shape @@ -0,0 +1,43 @@ +775 110336 +776 188416 +777 189184 +78 114176 +780 107008 +781 101888 +782 164864 +783 88064 +784 98816 +785 141824 +786 167936 +787 119296 +788 137472 +79 175360 +790 93440 +791 87552 +792 108800 +793 71168 +795 145408 +798 110592 +799 145152 +8 216576 +80 130542 +802 107008 +803 144896 +804 123904 +805 97792 +810 97024 +811 139008 +812 140800 +813 79360 +82 206848 +83 160000 +86 199936 +87 167680 +90 195840 +91 133888 +93 151552 +94 193280 +95 133120 +96 152320 +98 129792 +99 190976 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..eb476994702bc5251527be6fe05c7a1d8b6b8969 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/train/text_shape @@ -0,0 +1,43 @@ +775 60 +776 109 +777 102 +78 55 +780 56 +781 62 +782 74 +783 41 +784 48 +785 79 +786 92 +787 63 +788 70 +79 94 +790 51 +791 48 +792 59 +793 38 +795 73 +798 57 +799 87 +8 123 +80 70 +802 55 +803 66 +804 73 +805 50 +810 51 +811 79 +812 69 +813 42 +82 121 +83 59 +86 99 +87 77 +90 100 +91 66 +93 76 +94 102 +95 63 +96 78 +98 68 +99 96 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..46fe7eae46210a938bc6c880dd7e54c9d5930f45 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05196e04934152393850816204f5daa4ee868eb77c4d85e8817d4455e6176384 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..955357c2817255cf83e18285529493d7406ea5df --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e342deb297da94e59ccffa8b231de766b91ffb069d9c9fb69c9302f552cc4297 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4328185a2cb6e8c1d0662203dc535a93a3fec2da --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/speech_shape @@ -0,0 +1,2 @@ +739 167168 +808 133376 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..a5b83925d94976225380900860dc255a4c366a46 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.32/valid/text_shape @@ -0,0 +1,2 @@ +739 80 +808 64 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4.log b/exp/tts_stats_raw_phn_none/logdir/stats.4.log new file mode 100644 index 0000000000000000000000000000000000000000..7390f7ef30f6433ef9acfe41794513e2e3c634b4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.4 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:27 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.4.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.4.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.4 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:09:36,215 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:09:36,943 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:09:36,946 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:09:36,946 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:09:36,946 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:09:36,946 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.4/config.yaml +[7850374a3496] 2023-07-13 14:09:36,969 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.4', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.4.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.4.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=14 threads=1 +# Ended (code 0) at Thu Jul 13 14:09:41 UTC 2023, elapsed time 14 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.4/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e417c9bc97c1c6eae51813d1912af99bb3499299 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.4 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.4.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.4.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e4de7c8c89848d6773e773a021a6a1ad01b08c42 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcb3aea31bf7fa28f3f308d4d1e0e6b9b05d3c3f75f5fd01f6dbfe578672490 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e2be422cf14a141d15aeb213424f951a7caef368 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b4150cc55525671e42c64444a91bac751291be1568052c519935fc27fe7ea1 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..cc8efda03921623bfc95a83da258c1fe95473582 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/speech_shape @@ -0,0 +1,44 @@ +15004 172544 +15006 129536 +15010 164864 +15016 192000 +15018 284672 +15019 149760 +15020 148480 +15022 182272 +15025 162816 +15026 211200 +15034 165888 +15042 139264 +15043 261632 +15044 260096 +15086 250624 +151 105472 +15195 192256 +15202 231936 +15207 218368 +15212 303872 +15221 144128 +15225 192000 +15268 166912 +15284 92416 +15285 113920 +153 137472 +15322 160768 +15483 171264 +155 109568 +15501 113152 +15518 118528 +15520 171008 +156 147456 +15637 143616 +15638 171776 +15640 176640 +15658 205568 +15687 157696 +157 135168 +15703 159488 +15706 132096 +15707 142592 +15720 115200 +15721 234752 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..d970e0be8a4ccb92a35729d01e0cb8621882e48b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/train/text_shape @@ -0,0 +1,44 @@ +15004 75 +15006 58 +15010 86 +15016 114 +15018 146 +15019 74 +15020 72 +15022 89 +15025 87 +15026 114 +15034 84 +15042 66 +15043 128 +15044 139 +15086 129 +151 45 +15195 117 +15202 120 +15207 113 +15212 179 +15221 82 +15225 102 +15268 85 +15284 48 +15285 57 +153 62 +15322 87 +15483 90 +155 54 +15501 58 +15518 60 +15520 92 +156 76 +15637 82 +15638 105 +15640 99 +15658 116 +15687 85 +157 73 +15703 98 +15706 75 +15707 75 +15720 46 +15721 134 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..49138a92bd561ae77be74d2b54df9d1e2062d0b6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffca1960701f98e0ffa00f639619de9f8310854f1dac14e7a588e809beb52e61 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..227ce67c732a38eca4ea3d86ac13df29d7cf5181 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d797b76d20aea949e651d6f118c276504fff6cdd3a085afe875cc7097bee5c72 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..7acd9a6e88b8932e4b3db74e1ca4102e0a9aee6a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/speech_shape @@ -0,0 +1,2 @@ +129 144896 +130 189184 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..fc4f839a0bcb9fdf4f230d2b116b47f4e8801bf3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.4/valid/text_shape @@ -0,0 +1,2 @@ +129 68 +130 88 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.5/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec3e73a13d957487074f9785506d328fc0b29c1b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.5 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.5.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.5.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..4b48f891ea08d2d10ba60a97a4756f8107a56e14 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845964ee2329c1966e6a9ab0ff8f80d813b22303872ffdcf927d6e1b5b58db25 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c9b936b897fa9c73f92ba661217def76e721d1bb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f39948212d15d97371955e91d728d35e2934858e65984423f519610b0c75a5f4 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..6886da12a457abca0de89f501f2fcd1a5e8de6f7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/speech_shape @@ -0,0 +1,44 @@ +15763 190720 +158 149504 +15805 211456 +15834 113152 +15861 143360 +15897 109824 +159 99584 +15921 118272 +160 88064 +161 149760 +16124 169984 +16132 163328 +16136 108032 +16202 118016 +16296 92416 +16297 162560 +163 99072 +16324 128512 +16345 138240 +16346 248064 +16347 281344 +16348 169472 +16349 210176 +16350 190464 +16351 202752 +16368 234752 +164 113408 +16407 133888 +16412 120832 +16413 309504 +16414 208128 +166 128000 +167 138496 +16765 126464 +17 132096 +172 113152 +173 103424 +175 123136 +176 140032 +177 177408 +178 199680 +179 167424 +18 94720 +181 151040 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..aec27f53a9ee9372b7b0f2cb34b08876695299e5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/train/text_shape @@ -0,0 +1,44 @@ +15763 112 +158 75 +15805 108 +15834 59 +15861 74 +15897 47 +159 54 +15921 43 +160 49 +161 72 +16124 98 +16132 91 +16136 55 +16202 51 +16296 46 +16297 86 +163 46 +16324 64 +16345 77 +16346 152 +16347 162 +16348 94 +16349 116 +16350 97 +16351 102 +16368 114 +164 62 +16407 68 +16412 60 +16413 208 +16414 129 +166 64 +167 73 +16765 64 +17 66 +172 62 +173 54 +175 66 +176 76 +177 102 +178 132 +179 103 +18 52 +181 86 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..c49c1aa678fea0f6178d99174811f640f55fabe5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353ce11ad78a9107909646c5b6e6660ab4cf61382d9396770a032e450b57c3f6 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..689dc438fb6e388a5414199867a1afe22570e6ef --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f2b360be8bc8399fa945782c49b4631d7773f4f49cc8be85f73e8796b11600 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..e2cf9f2c274c4bbc17d29ab0aa880bc0218100c2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/speech_shape @@ -0,0 +1,2 @@ +131 209920 +14616 165120 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..6613c33fc450bc2ba987e433fb513f7ceee9605d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.5/valid/text_shape @@ -0,0 +1,2 @@ +131 105 +14616 89 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6.log b/exp/tts_stats_raw_phn_none/logdir/stats.6.log new file mode 100644 index 0000000000000000000000000000000000000000..76bf0366b5545c4867005d6b78449a2ed88a7900 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.6 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:41 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.6.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.6.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.6 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:09:48,947 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:09:49,779 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:09:49,782 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:09:49,783 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:09:49,783 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:09:49,783 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.6/config.yaml +[7850374a3496] 2023-07-13 14:09:49,807 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.6', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.6.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.6.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:09:54 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.6/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cd42e98013e59075149968ab7e2d30a9b463d7f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.6 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.6.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.6.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..bec9ef57f9101ba4e28ec6c680b89b023ff620ac --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:088a91604cd8b1467ebb16f27be89a467a746c7d6dc0b54757e0c9d0fabbab88 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..7ebe494932a2a5b55fc8e81ee256b746a054ef71 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a319a5ed93c1a02255b62459a41afdbe596fc6cb110b657c36f0a4f8cd497460 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..34ca9463c47a0feddd0b0372d402a51c7dc3217c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/speech_shape @@ -0,0 +1,43 @@ +18132 170496 +18133 329216 +18134 329472 +18135 248064 +18136 332032 +18137 313600 +18149 209664 +18150 170496 +18151 179968 +18155 338944 +18156 183808 +18157 211968 +18158 239872 +18159 268288 +18160 268032 +18161 314368 +18162 183040 +18163 178688 +18164 183040 +18166 224256 +18168 289280 +18169 250368 +18171 336640 +18172 242176 +18176 283392 +18177 159488 +18178 119296 +18180 192768 +18181 359424 +18184 237824 +18185 197120 +18187 249600 +18190 268288 +18194 195840 +182 208640 +18215 118528 +18216 195584 +18219 192256 +18220 226560 +18223 165120 +18224 173824 +18225 196864 +18227 188928 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f2d108e4ca5a474f6286016245e9d8e17ab50fae --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/train/text_shape @@ -0,0 +1,43 @@ +18132 92 +18133 175 +18134 194 +18135 140 +18136 191 +18137 172 +18149 120 +18150 88 +18151 104 +18155 182 +18156 90 +18157 107 +18158 122 +18159 147 +18160 152 +18161 169 +18162 98 +18163 106 +18164 112 +18166 136 +18168 156 +18169 163 +18171 204 +18172 122 +18176 155 +18177 102 +18178 77 +18180 106 +18181 203 +18184 122 +18185 111 +18187 147 +18190 148 +18194 116 +182 141 +18215 45 +18216 103 +18219 101 +18220 116 +18223 76 +18224 83 +18225 101 +18227 94 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..e4d4787570c67df4a2d4665b1d4ae181a86fb5c8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b9bfc34c603fd633eccbb9c9ca7f122d379f15db81b1ff9cc6ff71b89bca940 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b74f3f42383e9fa52a5256a417128eb7557f8dd1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08eed39ae1286364a378264451579568bf94eb1b9017f70a73a529770b2e6a6a +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..17dbeb49525578f9efff9c3347e824583be98989 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/speech_shape @@ -0,0 +1,2 @@ +14849 156672 +14891 265472 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..643cdaace2aca217a90dc035f635e476444d2d22 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.6/valid/text_shape @@ -0,0 +1,2 @@ +14849 74 +14891 125 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.7/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..055c5db71833222adebe22b1ba98acecee9ae60b --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.7 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.7.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.7.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..8f21eb6132aa398af85a08ba7b90e2d875a60834 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5e4cccff6220129afb807770617359452a7b69433f77c894c163fdc92ce4dd +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..94a401211a7e3f34cd15de49f424974bd2a97414 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5a7415aaa47d458760dfdc3a452c87a20e4b64a9064708f619d3e625c6d8861 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..7a2ea110bef6064a0137a3bf67ffd30b82b13205 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/speech_shape @@ -0,0 +1,43 @@ +18230 229888 +18231 136448 +18232 259328 +18233 190976 +18234 142592 +18235 116992 +18238 106496 +18239 132608 +18240 202496 +18242 155904 +18244 132096 +18247 98560 +18252 175616 +18296 139008 +183 171776 +18307 166144 +18308 124160 +18326 139264 +18337 154624 +18356 142080 +18375 122880 +18377 123392 +18378 110848 +18380 182528 +18387 153856 +18392 163840 +18394 206848 +184 110592 +18402 150016 +18406 163072 +18410 98816 +18415 148480 +18418 177408 +18423 130560 +18427 217088 +18432 166912 +18438 151040 +18446 157696 +18455 120832 +18458 173312 +18470 158464 +18471 125952 +18475 219648 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..f240fefacef88afd81129895fe7ad4de2024bbd3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/train/text_shape @@ -0,0 +1,43 @@ +18230 118 +18231 67 +18232 158 +18233 95 +18234 69 +18235 66 +18238 48 +18239 66 +18240 98 +18242 74 +18244 64 +18247 43 +18252 95 +18296 65 +183 89 +18307 79 +18308 67 +18326 73 +18337 75 +18356 58 +18375 57 +18377 66 +18378 57 +18380 107 +18387 79 +18392 95 +18394 121 +184 53 +18402 90 +18406 100 +18410 49 +18415 68 +18418 99 +18423 68 +18427 123 +18432 101 +18438 71 +18446 69 +18455 66 +18458 103 +18470 80 +18471 61 +18475 118 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..080204413130dde6208e5f2deb45ae9d3bf24395 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63577fcdf417a42c9b5e0825f1f4fcc0ca4f57e426a9c36a01b08df1a516b6ef +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..41c353ce0a123b581c11b3ecc326c18fb844314a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a94f3243b3fb87deaa3930cc5c3fe91f440d36f711bc45f94afb23f7dcde93 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..26f7a402635c7bb26f1c357e0ca2eeacb083daa0 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/speech_shape @@ -0,0 +1,2 @@ +14941 166144 +14991 157696 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..437591076547160e7734f733fd6cc79a2a76adbd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.7/valid/text_shape @@ -0,0 +1,2 @@ +14941 88 +14991 96 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8.log b/exp/tts_stats_raw_phn_none/logdir/stats.8.log new file mode 100644 index 0000000000000000000000000000000000000000..a7b8e383deba552af375a8ba6107850a1e8725a3 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.8 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:09:54 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.8.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.8.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.8 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:02,180 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:02,848 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:02,851 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:02,851 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:02,851 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:02,852 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.8/config.yaml +[7850374a3496] 2023-07-13 14:10:02,876 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.8', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.8.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.8.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:07 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.8/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2b8c9420e0dddfd8c748370ab4cb9a5e90e76ea --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.8 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.8.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.8.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..cd23b2424eb7eee02fa811e172ed33d4a6978cd6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07781a532d3c754cff8a7ccbcdaf4348594e82f503d480bb7aabd41918a135d5 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..56b7c48b4f7a0d4b84dba12a84ea528fbd8499fc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea1849cd3aa0024a44d54932101725f689127cae67f897a12d3f386de5b3887 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..9206571314df7612a38332005beffb1f51137d8c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/speech_shape @@ -0,0 +1,43 @@ +18482 144128 +18483 169984 +18497 119552 +18498 128512 +185 209152 +18503 192000 +18509 153856 +18512 188672 +18535 187648 +18538 127488 +18541 132352 +18543 120320 +18546 175616 +18548 139264 +18552 187648 +18553 159488 +18559 151296 +18562 115200 +18564 153088 +18567 193536 +18568 139520 +18569 176640 +18570 135424 +18571 204288 +18578 133376 +18579 118528 +18580 96512 +18582 186624 +18584 151552 +18590 177664 +186 178432 +18601 130560 +18606 187904 +18610 115712 +18624 213760 +18639 189184 +18641 125440 +18642 175104 +18646 124672 +18655 138240 +18658 138240 +18670 125952 +18680 197120 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..fa5aebe2fc1abfed1cb110ebde0d1eb901437f84 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/train/text_shape @@ -0,0 +1,43 @@ +18482 73 +18483 95 +18497 60 +18498 52 +185 120 +18503 101 +18509 81 +18512 104 +18535 91 +18538 62 +18541 34 +18543 62 +18546 89 +18548 66 +18552 104 +18553 68 +18559 82 +18562 40 +18564 79 +18567 94 +18568 65 +18569 93 +18570 58 +18571 103 +18578 72 +18579 65 +18580 45 +18582 108 +18584 67 +18590 92 +186 83 +18601 73 +18606 107 +18610 52 +18624 121 +18639 114 +18641 62 +18642 97 +18646 67 +18655 64 +18658 80 +18670 62 +18680 117 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..de89354fb61d6955854b27995f442e13f736dac2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e5fb0ba64944b85f5116a1c94040c552b09d247a3c142225d996a416b8f06f +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..f2bde674d8ffe4f54089b1b2efd2fcae223d45de --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab2cc87abe7433185736998e8c79f005dc9a2af35f358cff109d5066e99958e +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..6a0de9202a4d1cf8af6b1dc686be26ca0e737262 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/speech_shape @@ -0,0 +1,2 @@ +15003 125184 +15079 148992 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..6653088068969d3366b5d36db41e00bd88eceefb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.8/valid/text_shape @@ -0,0 +1,2 @@ +15003 69 +15079 86 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9.log b/exp/tts_stats_raw_phn_none/logdir/stats.9.log new file mode 100644 index 0000000000000000000000000000000000000000..0d6e72d1552e35051cab20b0f5798855ca11c09c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9.log @@ -0,0 +1,116 @@ +# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.9.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.9.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.9 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +# Started at Thu Jul 13 14:10:06 UTC 2023 +# +/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5 + warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" +/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.9.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.9.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.9 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null +[7850374a3496] 2023-07-13 14:10:14,905 (tts:293) INFO: Vocabulary size: 79 +[7850374a3496] 2023-07-13 14:10:15,617 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True +[7850374a3496] 2023-07-13 14:10:15,620 (abs_task:1204) INFO: Model structure: +ESPnetTTSModel( + (feats_extract): LogMelFbank( + (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True) + (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False) + ) + (tts): Tacotron2( + (enc): Encoder( + (embed): Embedding(79, 512, padding_idx=0) + (convs): ModuleList( + (0-2): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): ReLU() + (3): Dropout(p=0.5, inplace=False) + ) + ) + (blstm): LSTM(512, 256, batch_first=True, bidirectional=True) + ) + (dec): Decoder( + (att): AttLoc( + (mlp_enc): Linear(in_features=512, out_features=512, bias=True) + (mlp_dec): Linear(in_features=1024, out_features=512, bias=False) + (mlp_att): Linear(in_features=32, out_features=512, bias=False) + (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False) + (gvec): Linear(in_features=512, out_features=1, bias=True) + ) + (lstm): ModuleList( + (0): ZoneOutCell( + (cell): LSTMCell(768, 1024) + ) + (1): ZoneOutCell( + (cell): LSTMCell(1024, 1024) + ) + ) + (prenet): Prenet( + (prenet): ModuleList( + (0): Sequential( + (0): Linear(in_features=80, out_features=256, bias=True) + (1): ReLU() + ) + (1): Sequential( + (0): Linear(in_features=256, out_features=256, bias=True) + (1): ReLU() + ) + ) + ) + (postnet): Postnet( + (postnet): ModuleList( + (0): Sequential( + (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (1-3): 3 x Sequential( + (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Tanh() + (3): Dropout(p=0.5, inplace=False) + ) + (4): Sequential( + (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) + (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (2): Dropout(p=0.5, inplace=False) + ) + ) + ) + (feat_out): Linear(in_features=1536, out_features=240, bias=False) + (prob_out): Linear(in_features=1536, out_features=3, bias=True) + ) + (taco2_loss): Tacotron2Loss( + (l1_criterion): L1Loss() + (mse_criterion): MSELoss() + (bce_criterion): BCEWithLogitsLoss() + ) + (attn_loss): GuidedAttentionLoss() + ) +) + +Model summary: + Class Name: ESPnetTTSModel + Total Number of model parameters: 26.91 M + Number of trainable parameters: 26.91 M (100.0%) + Size: 107.63 MB + Type: torch.float32 +[7850374a3496] 2023-07-13 14:10:15,621 (abs_task:1207) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + differentiable: False + eps: 1e-06 + foreach: None + fused: None + lr: 0.001 + maximize: False + weight_decay: 0.0 +) +[7850374a3496] 2023-07-13 14:10:15,621 (abs_task:1208) INFO: Scheduler: None +[7850374a3496] 2023-07-13 14:10:15,621 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.9/config.yaml +[7850374a3496] 2023-07-13 14:10:15,651 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.9', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.9.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.9.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['', '', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', ''], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False) +/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error. +Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.) + return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined] +# Accounting: time=13 threads=1 +# Ended (code 0) at Thu Jul 13 14:10:19 UTC 2023, elapsed time 13 seconds diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/config.yaml b/exp/tts_stats_raw_phn_none/logdir/stats.9/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec273edaa6d2bc7dc016aa110b6c638d62ad700 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/config.yaml @@ -0,0 +1,267 @@ +config: conf/tuning/finetune_tacotron2.yaml +print_config: false +log_level: INFO +dry_run: false +iterator_type: sequence +output_dir: exp/tts_stats_raw_phn_none/logdir/stats.9 +ngpu: 0 +seed: 0 +num_workers: 1 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: null +dist_rank: null +local_rank: null +dist_master_addr: null +dist_master_port: null +dist_launcher: null +multiprocessing_distributed: false +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: true +write_collected_feats: false +max_epoch: 120 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - loss + - min +- - train + - loss + - min +keep_nbest_models: 5 +nbest_averaging_interval: 0 +grad_clip: 1.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 1 +no_forward_run: false +resume: false +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: 200 +batch_size: 20 +valid_batch_size: null +batch_bins: 1600000 +valid_batch_bins: null +train_shape_file: +- exp/tts_stats_raw_phn_none/logdir/train.9.scp +valid_shape_file: +- exp/tts_stats_raw_phn_none/logdir/valid.9.scp +batch_type: numel +valid_batch_type: null +fold_length: [] +sort_in_batch: descending +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train/text + - text + - text +- - dump/raw/train/wav.scp + - speech + - sound +valid_data_path_and_name_and_type: +- - dump/raw/dev/text + - text + - text +- - dump/raw/dev/wav.scp + - speech + - sound +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.001 + eps: 1.0e-06 + weight_decay: 0.0 +scheduler: null +scheduler_conf: {} +token_list: +- +- +- a +- sil +- l +- aa +- m +- ii0 +- t +- < +- n +- r +- E +- i0 +- b +- uu0 +- f +- i1 +- k +- w +- A +- s +- y +- d +- q +- h +- H +- $ +- u0 +- AA +- j +- T +- x +- S +- z +- ll +- I1 +- D +- II0 +- g +- tt +- rr +- I0 +- UU0 +- dd +- u1 +- U0 +- mm +- nn +- '*' +- $$ +- bb +- yy +- ss +- jj +- ww +- ^ +- SS +- TT +- Z +- zz +- kk +- U1 +- HH +- ff +- qq +- xx +- ^^ +- DD +- hh +- EE +- ZZ +- '**' +- aaaa +- ssss +- v +- uu1 +- jjjj +- +odim: null +model_conf: {} +use_preprocessor: true +token_type: phn +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +feats_extract: fbank +feats_extract_conf: + n_fft: 1024 + hop_length: 256 + win_length: null + fs: 22050 + fmin: 80 + fmax: 7600 + n_mels: 80 +normalize: null +normalize_conf: {} +tts: tacotron2 +tts_conf: + embed_dim: 512 + elayers: 1 + eunits: 512 + econv_layers: 3 + econv_chans: 512 + econv_filts: 5 + atype: location + adim: 512 + aconv_chans: 32 + aconv_filts: 15 + cumulate_att_w: true + dlayers: 2 + dunits: 1024 + prenet_layers: 2 + prenet_units: 256 + postnet_layers: 5 + postnet_chans: 512 + postnet_filts: 5 + output_activation: null + use_batch_norm: true + use_concate: true + use_residual: false + dropout_rate: 0.5 + zoneout_rate: 0.1 + reduction_factor: 3 + spk_embed_dim: null + use_masking: true + bce_pos_weight: 20.0 + use_guided_attn_loss: true + guided_attn_loss_sigma: 0.4 + guided_attn_loss_lambda: 1.0 +pitch_extract: null +pitch_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + f0max: 400 + f0min: 80 +pitch_normalize: null +pitch_normalize_conf: {} +energy_extract: null +energy_extract_conf: + fs: 22050 + n_fft: 1024 + hop_length: 256 + win_length: null +energy_normalize: null +energy_normalize_conf: {} +required: +- output_dir +- token_list +version: '202304' +distributed: false diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..d379110975cee22354bd875128a86c90a0c6178a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e570246d80885446e56de4819e8928bb0ccb8229e69f759dbea211b8260757 +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..fdf3b9562eacc5ea519a5db4f924a4d1ca38006c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b9e38955e4a1d54c44825203422e7fcf74c6381b16b914c474b4569db1867b7 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..6634f5992bae23d3d2d6d9e743640408a1146960 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/speech_shape @@ -0,0 +1,43 @@ +18681 195328 +18686 142336 +187 171264 +18706 269824 +18707 189696 +18709 145152 +18711 114688 +18720 132352 +18747 200192 +18755 96256 +18758 217344 +18760 128000 +18768 193024 +18769 105728 +18772 164608 +18786 253952 +18796 185269 +18804 201984 +18813 178176 +18835 184832 +18837 192512 +18838 208384 +18841 211968 +18851 199936 +18858 146688 +18861 149504 +18862 215296 +18867 199168 +18870 221952 +18873 191744 +18886 203264 +18893 230144 +18899 134912 +189 160768 +18904 183808 +18906 147712 +18908 136448 +18909 156160 +18910 211968 +18912 167680 +18915 148480 +18919 197888 +18934 191232 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/train/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..fc80aaf53dc136e460ff7319c99f906e5705f51a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/train/text_shape @@ -0,0 +1,43 @@ +18681 117 +18686 77 +187 100 +18706 166 +18707 107 +18709 81 +18711 68 +18720 65 +18747 109 +18755 49 +18758 106 +18760 67 +18768 119 +18769 61 +18772 72 +18786 104 +18796 68 +18804 66 +18813 70 +18835 112 +18837 104 +18838 122 +18841 111 +18851 113 +18858 103 +18861 67 +18862 120 +18867 113 +18870 138 +18873 103 +18886 121 +18893 119 +18899 86 +189 92 +18904 90 +18906 82 +18908 59 +18909 76 +18910 100 +18912 87 +18915 72 +18919 107 +18934 103 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/batch_keys b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/batch_keys new file mode 100644 index 0000000000000000000000000000000000000000..f92e0c9a1be958877dbbbe31c6b40111bcb7ae58 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/batch_keys @@ -0,0 +1,2 @@ +text +speech diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..791412536e206f5f83e8c1863404b2d553bb8cec --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c5cf3fa3e7ecbd730cd31346cd22e72b56962ab034dd7a67bfee748d41cf2f +size 778 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..24c030c04c918df8dee6628f76c764d86e4bc81c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da81f5a3dc2535eea270b5e2d451a922ff028c5a25a8a02f8e520477ea3524de +size 1402 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/speech_shape b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..5d9a5782ae8ca96762f317d8ff565eae7d50d62c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/speech_shape @@ -0,0 +1,2 @@ +15269 138496 +15665 165376 diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/stats_keys b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/stats_keys new file mode 100644 index 0000000000000000000000000000000000000000..8b395786dbc0bf6d417ffb91823949cfaf8116bc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/stats_keys @@ -0,0 +1,2 @@ +feats +feats_lengths diff --git a/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/text_shape b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..5b4c3ff01da533d11388d68da13193ad96a203bd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/stats.9/valid/text_shape @@ -0,0 +1,2 @@ +15269 59 +15665 96 diff --git a/exp/tts_stats_raw_phn_none/logdir/train.1.scp b/exp/tts_stats_raw_phn_none/logdir/train.1.scp new file mode 100644 index 0000000000000000000000000000000000000000..e1915441dfc247d7eff5b2fab9e4a2e73e43054a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.1.scp @@ -0,0 +1,44 @@ +1 downloads/dataset-tun/wav/0001.wav +100 downloads/dataset-tun/wav/0100.wav +101 downloads/dataset-tun/wav/0101.wav +103 downloads/dataset-tun/wav/0103.wav +104 downloads/dataset-tun/wav/0104.wav +105 downloads/dataset-tun/wav/0105.wav +106 downloads/dataset-tun/wav/0106.wav +108 downloads/dataset-tun/wav/0108.wav +109 downloads/dataset-tun/wav/0109.wav +110 downloads/dataset-tun/wav/0110.wav +111 downloads/dataset-tun/wav/0111.wav +112 downloads/dataset-tun/wav/0112.wav +113 downloads/dataset-tun/wav/0113.wav +114 downloads/dataset-tun/wav/0114.wav +116 downloads/dataset-tun/wav/0116.wav +117 downloads/dataset-tun/wav/0117.wav +118 downloads/dataset-tun/wav/0118.wav +124 downloads/dataset-tun/wav/0124.wav +127 downloads/dataset-tun/wav/0127.wav +13 downloads/dataset-tun/wav/0013.wav +132 downloads/dataset-tun/wav/0132.wav +133 downloads/dataset-tun/wav/0133.wav +135 downloads/dataset-tun/wav/0135.wav +136 downloads/dataset-tun/wav/0136.wav +137 downloads/dataset-tun/wav/0137.wav +139 downloads/dataset-tun/wav/0139.wav +14 downloads/dataset-tun/wav/0014.wav +141 downloads/dataset-tun/wav/0141.wav +142 downloads/dataset-tun/wav/0142.wav +143 downloads/dataset-tun/wav/0143.wav +144 downloads/dataset-tun/wav/0144.wav +145 downloads/dataset-tun/wav/0145.wav +14502 downloads/dataset-tun/wav/14502.wav +14507 downloads/dataset-tun/wav/14507.wav +14520 downloads/dataset-tun/wav/14520.wav +14522 downloads/dataset-tun/wav/14522.wav +14523 downloads/dataset-tun/wav/14523.wav +14532 downloads/dataset-tun/wav/14532.wav +14538 downloads/dataset-tun/wav/14538.wav +14540 downloads/dataset-tun/wav/14540.wav +14551 downloads/dataset-tun/wav/14551.wav +146 downloads/dataset-tun/wav/0146.wav +14602 downloads/dataset-tun/wav/14602.wav +14612 downloads/dataset-tun/wav/14612.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.10.scp b/exp/tts_stats_raw_phn_none/logdir/train.10.scp new file mode 100644 index 0000000000000000000000000000000000000000..4a4fc7563cc9bfb29682c6dc4c558fbea6c79b43 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.10.scp @@ -0,0 +1,43 @@ +18935 downloads/dataset-tun/wav/18935.wav +18936 downloads/dataset-tun/wav/18936.wav +18943 downloads/dataset-tun/wav/18943.wav +18944 downloads/dataset-tun/wav/18944.wav +18947 downloads/dataset-tun/wav/18947.wav +18951 downloads/dataset-tun/wav/18951.wav +18955 downloads/dataset-tun/wav/18955.wav +18959 downloads/dataset-tun/wav/18959.wav +18964 downloads/dataset-tun/wav/18964.wav +18982 downloads/dataset-tun/wav/18982.wav +18989 downloads/dataset-tun/wav/18989.wav +18991 downloads/dataset-tun/wav/18991.wav +18993 downloads/dataset-tun/wav/18993.wav +18997 downloads/dataset-tun/wav/18997.wav +19 downloads/dataset-tun/wav/0019.wav +19001 downloads/dataset-tun/wav/19001.wav +19005 downloads/dataset-tun/wav/19005.wav +19010 downloads/dataset-tun/wav/19010.wav +19011 downloads/dataset-tun/wav/19011.wav +19015 downloads/dataset-tun/wav/19015.wav +19024 downloads/dataset-tun/wav/19024.wav +19028 downloads/dataset-tun/wav/19028.wav +19063 downloads/dataset-tun/wav/19063.wav +19065 downloads/dataset-tun/wav/19065.wav +19067 downloads/dataset-tun/wav/19067.wav +19075 downloads/dataset-tun/wav/19075.wav +19076 downloads/dataset-tun/wav/19076.wav +19090 downloads/dataset-tun/wav/19090.wav +19091 downloads/dataset-tun/wav/19091.wav +19095 downloads/dataset-tun/wav/19095.wav +19096 downloads/dataset-tun/wav/19096.wav +19099 downloads/dataset-tun/wav/19099.wav +191 downloads/dataset-tun/wav/0191.wav +19103 downloads/dataset-tun/wav/19103.wav +19109 downloads/dataset-tun/wav/19109.wav +19111 downloads/dataset-tun/wav/19111.wav +19113 downloads/dataset-tun/wav/19113.wav +19116 downloads/dataset-tun/wav/19116.wav +19118 downloads/dataset-tun/wav/19118.wav +19121 downloads/dataset-tun/wav/19121.wav +19122 downloads/dataset-tun/wav/19122.wav +19132 downloads/dataset-tun/wav/19132.wav +19138 downloads/dataset-tun/wav/19138.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.11.scp b/exp/tts_stats_raw_phn_none/logdir/train.11.scp new file mode 100644 index 0000000000000000000000000000000000000000..6e2ed06dcecba9f2c4479ce8bbdd6e041b045a35 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.11.scp @@ -0,0 +1,43 @@ +19141 downloads/dataset-tun/wav/19141.wav +19142 downloads/dataset-tun/wav/19142.wav +19157 downloads/dataset-tun/wav/19157.wav +19160 downloads/dataset-tun/wav/19160.wav +19163 downloads/dataset-tun/wav/19163.wav +19165 downloads/dataset-tun/wav/19165.wav +19177 downloads/dataset-tun/wav/19177.wav +19180 downloads/dataset-tun/wav/19180.wav +19181 downloads/dataset-tun/wav/19181.wav +19194 downloads/dataset-tun/wav/19194.wav +19197 downloads/dataset-tun/wav/19197.wav +192 downloads/dataset-tun/wav/0192.wav +19201 downloads/dataset-tun/wav/19201.wav +19211 downloads/dataset-tun/wav/19211.wav +19212 downloads/dataset-tun/wav/19212.wav +19213 downloads/dataset-tun/wav/19213.wav +19218 downloads/dataset-tun/wav/19218.wav +19224 downloads/dataset-tun/wav/19224.wav +19225 downloads/dataset-tun/wav/19225.wav +19229 downloads/dataset-tun/wav/19229.wav +19234 downloads/dataset-tun/wav/19234.wav +19237 downloads/dataset-tun/wav/19237.wav +19241 downloads/dataset-tun/wav/19241.wav +19251 downloads/dataset-tun/wav/19251.wav +19263 downloads/dataset-tun/wav/19263.wav +19267 downloads/dataset-tun/wav/19267.wav +19271 downloads/dataset-tun/wav/19271.wav +19276 downloads/dataset-tun/wav/19276.wav +19280 downloads/dataset-tun/wav/19280.wav +19281 downloads/dataset-tun/wav/19281.wav +19295 downloads/dataset-tun/wav/19295.wav +19298 downloads/dataset-tun/wav/19298.wav +19304 downloads/dataset-tun/wav/19304.wav +19310 downloads/dataset-tun/wav/19310.wav +19316 downloads/dataset-tun/wav/19316.wav +19321 downloads/dataset-tun/wav/19321.wav +19325 downloads/dataset-tun/wav/19325.wav +19327 downloads/dataset-tun/wav/19327.wav +19333 downloads/dataset-tun/wav/19333.wav +19337 downloads/dataset-tun/wav/19337.wav +19347 downloads/dataset-tun/wav/19347.wav +19348 downloads/dataset-tun/wav/19348.wav +19357 downloads/dataset-tun/wav/19357.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.12.scp b/exp/tts_stats_raw_phn_none/logdir/train.12.scp new file mode 100644 index 0000000000000000000000000000000000000000..3fa23a36d7b0b5e50e7d3ef50a5b38dd6d1a91e2 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.12.scp @@ -0,0 +1,43 @@ +19360 downloads/dataset-tun/wav/19360.wav +19366 downloads/dataset-tun/wav/19366.wav +19367 downloads/dataset-tun/wav/19367.wav +19371 downloads/dataset-tun/wav/19371.wav +19372 downloads/dataset-tun/wav/19372.wav +19374 downloads/dataset-tun/wav/19374.wav +19376 downloads/dataset-tun/wav/19376.wav +19387 downloads/dataset-tun/wav/19387.wav +19396 downloads/dataset-tun/wav/19396.wav +19399 downloads/dataset-tun/wav/19399.wav +194 downloads/dataset-tun/wav/0194.wav +19400 downloads/dataset-tun/wav/19400.wav +19404 downloads/dataset-tun/wav/19404.wav +19406 downloads/dataset-tun/wav/19406.wav +19410 downloads/dataset-tun/wav/19410.wav +19413 downloads/dataset-tun/wav/19413.wav +19414 downloads/dataset-tun/wav/19414.wav +19423 downloads/dataset-tun/wav/19423.wav +19429 downloads/dataset-tun/wav/19429.wav +19439 downloads/dataset-tun/wav/19439.wav +19440 downloads/dataset-tun/wav/19440.wav +19449 downloads/dataset-tun/wav/19449.wav +19451 downloads/dataset-tun/wav/19451.wav +19454 downloads/dataset-tun/wav/19454.wav +19477 downloads/dataset-tun/wav/19477.wav +19482 downloads/dataset-tun/wav/19482.wav +19488 downloads/dataset-tun/wav/19488.wav +19496 downloads/dataset-tun/wav/19496.wav +19499 downloads/dataset-tun/wav/19499.wav +195 downloads/dataset-tun/wav/0195.wav +19501 downloads/dataset-tun/wav/19501.wav +19506 downloads/dataset-tun/wav/19506.wav +19509 downloads/dataset-tun/wav/19509.wav +19510 downloads/dataset-tun/wav/19510.wav +19511 downloads/dataset-tun/wav/19511.wav +19521 downloads/dataset-tun/wav/19521.wav +19522 downloads/dataset-tun/wav/19522.wav +19524 downloads/dataset-tun/wav/19524.wav +19529 downloads/dataset-tun/wav/19529.wav +19540 downloads/dataset-tun/wav/19540.wav +19542 downloads/dataset-tun/wav/19542.wav +19543 downloads/dataset-tun/wav/19543.wav +19548 downloads/dataset-tun/wav/19548.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.13.scp b/exp/tts_stats_raw_phn_none/logdir/train.13.scp new file mode 100644 index 0000000000000000000000000000000000000000..303242552067df81255eae5a431ee73a1113cc56 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.13.scp @@ -0,0 +1,43 @@ +19550 downloads/dataset-tun/wav/19550.wav +19551 downloads/dataset-tun/wav/19551.wav +19554 downloads/dataset-tun/wav/19554.wav +19556 downloads/dataset-tun/wav/19556.wav +19558 downloads/dataset-tun/wav/19558.wav +19565 downloads/dataset-tun/wav/19565.wav +19569 downloads/dataset-tun/wav/19569.wav +19576 downloads/dataset-tun/wav/19576.wav +19581 downloads/dataset-tun/wav/19581.wav +19584 downloads/dataset-tun/wav/19584.wav +19585 downloads/dataset-tun/wav/19585.wav +19587 downloads/dataset-tun/wav/19587.wav +19590 downloads/dataset-tun/wav/19590.wav +19595 downloads/dataset-tun/wav/19595.wav +19598 downloads/dataset-tun/wav/19598.wav +196 downloads/dataset-tun/wav/0196.wav +19601 downloads/dataset-tun/wav/19601.wav +19604 downloads/dataset-tun/wav/19604.wav +19605 downloads/dataset-tun/wav/19605.wav +19608 downloads/dataset-tun/wav/19608.wav +19611 downloads/dataset-tun/wav/19611.wav +19612 downloads/dataset-tun/wav/19612.wav +19621 downloads/dataset-tun/wav/19621.wav +19622 downloads/dataset-tun/wav/19622.wav +19627 downloads/dataset-tun/wav/19627.wav +19631 downloads/dataset-tun/wav/19631.wav +19635 downloads/dataset-tun/wav/19635.wav +19638 downloads/dataset-tun/wav/19638.wav +19651 downloads/dataset-tun/wav/19651.wav +19658 downloads/dataset-tun/wav/19658.wav +19659 downloads/dataset-tun/wav/19659.wav +19673 downloads/dataset-tun/wav/19673.wav +19676 downloads/dataset-tun/wav/19676.wav +19683 downloads/dataset-tun/wav/19683.wav +19684 downloads/dataset-tun/wav/19684.wav +19688 downloads/dataset-tun/wav/19688.wav +19692 downloads/dataset-tun/wav/19692.wav +19695 downloads/dataset-tun/wav/19695.wav +197 downloads/dataset-tun/wav/0197.wav +19700 downloads/dataset-tun/wav/19700.wav +19702 downloads/dataset-tun/wav/19702.wav +19703 downloads/dataset-tun/wav/19703.wav +19705 downloads/dataset-tun/wav/19705.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.14.scp b/exp/tts_stats_raw_phn_none/logdir/train.14.scp new file mode 100644 index 0000000000000000000000000000000000000000..df8557f660d7677d3c777897972267dc8bd320eb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.14.scp @@ -0,0 +1,43 @@ +19713 downloads/dataset-tun/wav/19713.wav +19714 downloads/dataset-tun/wav/19714.wav +19723 downloads/dataset-tun/wav/19723.wav +19726 downloads/dataset-tun/wav/19726.wav +19748 downloads/dataset-tun/wav/19748.wav +19749 downloads/dataset-tun/wav/19749.wav +19753 downloads/dataset-tun/wav/19753.wav +19760 downloads/dataset-tun/wav/19760.wav +19763 downloads/dataset-tun/wav/19763.wav +19764 downloads/dataset-tun/wav/19764.wav +19775 downloads/dataset-tun/wav/19775.wav +19778 downloads/dataset-tun/wav/19778.wav +19782 downloads/dataset-tun/wav/19782.wav +19783 downloads/dataset-tun/wav/19783.wav +19785 downloads/dataset-tun/wav/19785.wav +19787 downloads/dataset-tun/wav/19787.wav +19789 downloads/dataset-tun/wav/19789.wav +19791 downloads/dataset-tun/wav/19791.wav +19797 downloads/dataset-tun/wav/19797.wav +19798 downloads/dataset-tun/wav/19798.wav +198 downloads/dataset-tun/wav/0198.wav +19801 downloads/dataset-tun/wav/19801.wav +19812 downloads/dataset-tun/wav/19812.wav +19826 downloads/dataset-tun/wav/19826.wav +19851 downloads/dataset-tun/wav/19851.wav +19854 downloads/dataset-tun/wav/19854.wav +19855 downloads/dataset-tun/wav/19855.wav +19864 downloads/dataset-tun/wav/19864.wav +19874 downloads/dataset-tun/wav/19874.wav +19875 downloads/dataset-tun/wav/19875.wav +19877 downloads/dataset-tun/wav/19877.wav +19884 downloads/dataset-tun/wav/19884.wav +19888 downloads/dataset-tun/wav/19888.wav +19889 downloads/dataset-tun/wav/19889.wav +199 downloads/dataset-tun/wav/0199.wav +19902 downloads/dataset-tun/wav/19902.wav +19903 downloads/dataset-tun/wav/19903.wav +19907 downloads/dataset-tun/wav/19907.wav +19913 downloads/dataset-tun/wav/19913.wav +19917 downloads/dataset-tun/wav/19917.wav +19918 downloads/dataset-tun/wav/19918.wav +19920 downloads/dataset-tun/wav/19920.wav +19926 downloads/dataset-tun/wav/19926.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.17.scp b/exp/tts_stats_raw_phn_none/logdir/train.17.scp new file mode 100644 index 0000000000000000000000000000000000000000..c86d6e077593750be784ffcfcdac1d8708422e3f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.17.scp @@ -0,0 +1,43 @@ +20396 downloads/dataset-tun/wav/20396.wav +20397 downloads/dataset-tun/wav/20397.wav +20399 downloads/dataset-tun/wav/20399.wav +204 downloads/dataset-tun/wav/0204.wav +20402 downloads/dataset-tun/wav/20402.wav +20406 downloads/dataset-tun/wav/20406.wav +20408 downloads/dataset-tun/wav/20408.wav +20410 downloads/dataset-tun/wav/20410.wav +20413 downloads/dataset-tun/wav/20413.wav +20422 downloads/dataset-tun/wav/20422.wav +20427 downloads/dataset-tun/wav/20427.wav +20435 downloads/dataset-tun/wav/20435.wav +20438 downloads/dataset-tun/wav/20438.wav +20440 downloads/dataset-tun/wav/20440.wav +20442 downloads/dataset-tun/wav/20442.wav +20445 downloads/dataset-tun/wav/20445.wav +20447 downloads/dataset-tun/wav/20447.wav +20461 downloads/dataset-tun/wav/20461.wav +20464 downloads/dataset-tun/wav/20464.wav +20465 downloads/dataset-tun/wav/20465.wav +20482 downloads/dataset-tun/wav/20482.wav +20484 downloads/dataset-tun/wav/20484.wav +20488 downloads/dataset-tun/wav/20488.wav +20489 downloads/dataset-tun/wav/20489.wav +20495 downloads/dataset-tun/wav/20495.wav +20496 downloads/dataset-tun/wav/20496.wav +205 downloads/dataset-tun/wav/0205.wav +20503 downloads/dataset-tun/wav/20503.wav +20504 downloads/dataset-tun/wav/20504.wav +20512 downloads/dataset-tun/wav/20512.wav +20513 downloads/dataset-tun/wav/20513.wav +20516 downloads/dataset-tun/wav/20516.wav +20517 downloads/dataset-tun/wav/20517.wav +20520 downloads/dataset-tun/wav/20520.wav +20523 downloads/dataset-tun/wav/20523.wav +20524 downloads/dataset-tun/wav/20524.wav +20532 downloads/dataset-tun/wav/20532.wav +20535 downloads/dataset-tun/wav/20535.wav +20547 downloads/dataset-tun/wav/20547.wav +20549 downloads/dataset-tun/wav/20549.wav +20554 downloads/dataset-tun/wav/20554.wav +20555 downloads/dataset-tun/wav/20555.wav +20558 downloads/dataset-tun/wav/20558.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.18.scp b/exp/tts_stats_raw_phn_none/logdir/train.18.scp new file mode 100644 index 0000000000000000000000000000000000000000..3c36ae6997afd69f0d6b56526992c752d939783a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.18.scp @@ -0,0 +1,43 @@ +20561 downloads/dataset-tun/wav/20561.wav +20575 downloads/dataset-tun/wav/20575.wav +20576 downloads/dataset-tun/wav/20576.wav +20582 downloads/dataset-tun/wav/20582.wav +20584 downloads/dataset-tun/wav/20584.wav +20587 downloads/dataset-tun/wav/20587.wav +20589 downloads/dataset-tun/wav/20589.wav +20597 downloads/dataset-tun/wav/20597.wav +206 downloads/dataset-tun/wav/0206.wav +20604 downloads/dataset-tun/wav/20604.wav +20605 downloads/dataset-tun/wav/20605.wav +20606 downloads/dataset-tun/wav/20606.wav +20607 downloads/dataset-tun/wav/20607.wav +20616 downloads/dataset-tun/wav/20616.wav +20617 downloads/dataset-tun/wav/20617.wav +20621 downloads/dataset-tun/wav/20621.wav +20622 downloads/dataset-tun/wav/20622.wav +20629 downloads/dataset-tun/wav/20629.wav +20633 downloads/dataset-tun/wav/20633.wav +20635 downloads/dataset-tun/wav/20635.wav +20643 downloads/dataset-tun/wav/20643.wav +20644 downloads/dataset-tun/wav/20644.wav +20645 downloads/dataset-tun/wav/20645.wav +20663 downloads/dataset-tun/wav/20663.wav +20665 downloads/dataset-tun/wav/20665.wav +20668 downloads/dataset-tun/wav/20668.wav +20673 downloads/dataset-tun/wav/20673.wav +20674 downloads/dataset-tun/wav/20674.wav +20675 downloads/dataset-tun/wav/20675.wav +20679 downloads/dataset-tun/wav/20679.wav +20698 downloads/dataset-tun/wav/20698.wav +20699 downloads/dataset-tun/wav/20699.wav +207 downloads/dataset-tun/wav/0207.wav +20702 downloads/dataset-tun/wav/20702.wav +20708 downloads/dataset-tun/wav/20708.wav +20709 downloads/dataset-tun/wav/20709.wav +20710 downloads/dataset-tun/wav/20710.wav +20714 downloads/dataset-tun/wav/20714.wav +20729 downloads/dataset-tun/wav/20729.wav +20739 downloads/dataset-tun/wav/20739.wav +20740 downloads/dataset-tun/wav/20740.wav +20751 downloads/dataset-tun/wav/20751.wav +20754 downloads/dataset-tun/wav/20754.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.19.scp b/exp/tts_stats_raw_phn_none/logdir/train.19.scp new file mode 100644 index 0000000000000000000000000000000000000000..440e0accb7f2a502cf53e7a9cb01bfd8f7f528a1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.19.scp @@ -0,0 +1,43 @@ +20755 downloads/dataset-tun/wav/20755.wav +208 downloads/dataset-tun/wav/0208.wav +209 downloads/dataset-tun/wav/0209.wav +20913 downloads/dataset-tun/wav/20913.wav +20914 downloads/dataset-tun/wav/20914.wav +20923 downloads/dataset-tun/wav/20923.wav +20926 downloads/dataset-tun/wav/20926.wav +20928 downloads/dataset-tun/wav/20928.wav +20935 downloads/dataset-tun/wav/20935.wav +20954 downloads/dataset-tun/wav/20954.wav +20955 downloads/dataset-tun/wav/20955.wav +20959 downloads/dataset-tun/wav/20959.wav +20961 downloads/dataset-tun/wav/20961.wav +20982 downloads/dataset-tun/wav/20982.wav +20988 downloads/dataset-tun/wav/20988.wav +20992 downloads/dataset-tun/wav/20992.wav +21002 downloads/dataset-tun/wav/21002.wav +21005 downloads/dataset-tun/wav/21005.wav +21017 downloads/dataset-tun/wav/21017.wav +21020 downloads/dataset-tun/wav/21020.wav +21022 downloads/dataset-tun/wav/21022.wav +21028 downloads/dataset-tun/wav/21028.wav +21030 downloads/dataset-tun/wav/21030.wav +21033 downloads/dataset-tun/wav/21033.wav +21038 downloads/dataset-tun/wav/21038.wav +21043 downloads/dataset-tun/wav/21043.wav +21045 downloads/dataset-tun/wav/21045.wav +21055 downloads/dataset-tun/wav/21055.wav +21061 downloads/dataset-tun/wav/21061.wav +21072 downloads/dataset-tun/wav/21072.wav +21075 downloads/dataset-tun/wav/21075.wav +21082 downloads/dataset-tun/wav/21082.wav +21084 downloads/dataset-tun/wav/21084.wav +21085 downloads/dataset-tun/wav/21085.wav +21088 downloads/dataset-tun/wav/21088.wav +21097 downloads/dataset-tun/wav/21097.wav +21098 downloads/dataset-tun/wav/21098.wav +211 downloads/dataset-tun/wav/0211.wav +21100 downloads/dataset-tun/wav/21100.wav +21101 downloads/dataset-tun/wav/21101.wav +21103 downloads/dataset-tun/wav/21103.wav +21107 downloads/dataset-tun/wav/21107.wav +21109 downloads/dataset-tun/wav/21109.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.22.scp b/exp/tts_stats_raw_phn_none/logdir/train.22.scp new file mode 100644 index 0000000000000000000000000000000000000000..65a6d549200009a100942679b023c15be474e49a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.22.scp @@ -0,0 +1,43 @@ +21547 downloads/dataset-tun/wav/21547.wav +21560 downloads/dataset-tun/wav/21560.wav +21567 downloads/dataset-tun/wav/21567.wav +21569 downloads/dataset-tun/wav/21569.wav +21571 downloads/dataset-tun/wav/21571.wav +21575 downloads/dataset-tun/wav/21575.wav +21584 downloads/dataset-tun/wav/21584.wav +21590 downloads/dataset-tun/wav/21590.wav +21591 downloads/dataset-tun/wav/21591.wav +21592 downloads/dataset-tun/wav/21592.wav +21596 downloads/dataset-tun/wav/21596.wav +21597 downloads/dataset-tun/wav/21597.wav +21598 downloads/dataset-tun/wav/21598.wav +21604 downloads/dataset-tun/wav/21604.wav +21605 downloads/dataset-tun/wav/21605.wav +21609 downloads/dataset-tun/wav/21609.wav +21623 downloads/dataset-tun/wav/21623.wav +21624 downloads/dataset-tun/wav/21624.wav +21675 downloads/dataset-tun/wav/21675.wav +21726 downloads/dataset-tun/wav/21726.wav +21740 downloads/dataset-tun/wav/21740.wav +21744 downloads/dataset-tun/wav/21744.wav +218 downloads/dataset-tun/wav/0218.wav +219 downloads/dataset-tun/wav/0219.wav +22 downloads/dataset-tun/wav/0022.wav +220 downloads/dataset-tun/wav/0220.wav +222 downloads/dataset-tun/wav/0222.wav +223 downloads/dataset-tun/wav/0223.wav +227 downloads/dataset-tun/wav/0227.wav +229 downloads/dataset-tun/wav/0229.wav +231 downloads/dataset-tun/wav/0231.wav +233 downloads/dataset-tun/wav/0233.wav +234 downloads/dataset-tun/wav/0234.wav +235 downloads/dataset-tun/wav/0235.wav +236 downloads/dataset-tun/wav/0236.wav +238 downloads/dataset-tun/wav/0238.wav +240 downloads/dataset-tun/wav/0240.wav +241 downloads/dataset-tun/wav/0241.wav +242 downloads/dataset-tun/wav/0242.wav +243 downloads/dataset-tun/wav/0243.wav +244 downloads/dataset-tun/wav/0244.wav +246 downloads/dataset-tun/wav/0246.wav +249 downloads/dataset-tun/wav/0249.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.23.scp b/exp/tts_stats_raw_phn_none/logdir/train.23.scp new file mode 100644 index 0000000000000000000000000000000000000000..43cbff7e411aa7344b87e1597196b7dac53d3be8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.23.scp @@ -0,0 +1,43 @@ +250 downloads/dataset-tun/wav/0250.wav +251 downloads/dataset-tun/wav/0251.wav +253 downloads/dataset-tun/wav/0253.wav +254 downloads/dataset-tun/wav/0254.wav +257 downloads/dataset-tun/wav/0257.wav +258 downloads/dataset-tun/wav/0258.wav +259 downloads/dataset-tun/wav/0259.wav +260 downloads/dataset-tun/wav/0260.wav +261 downloads/dataset-tun/wav/0261.wav +264 downloads/dataset-tun/wav/0264.wav +265 downloads/dataset-tun/wav/0265.wav +266 downloads/dataset-tun/wav/0266.wav +268 downloads/dataset-tun/wav/0268.wav +269 downloads/dataset-tun/wav/0269.wav +271 downloads/dataset-tun/wav/0271.wav +272 downloads/dataset-tun/wav/0272.wav +274 downloads/dataset-tun/wav/0274.wav +275 downloads/dataset-tun/wav/0275.wav +277 downloads/dataset-tun/wav/0277.wav +278 downloads/dataset-tun/wav/0278.wav +281 downloads/dataset-tun/wav/0281.wav +282 downloads/dataset-tun/wav/0282.wav +283 downloads/dataset-tun/wav/0283.wav +284 downloads/dataset-tun/wav/0284.wav +288 downloads/dataset-tun/wav/0288.wav +289 downloads/dataset-tun/wav/0289.wav +290 downloads/dataset-tun/wav/0290.wav +293 downloads/dataset-tun/wav/0293.wav +294 downloads/dataset-tun/wav/0294.wav +295 downloads/dataset-tun/wav/0295.wav +297 downloads/dataset-tun/wav/0297.wav +299 downloads/dataset-tun/wav/0299.wav +3 downloads/dataset-tun/wav/0003.wav +300 downloads/dataset-tun/wav/0300.wav +301 downloads/dataset-tun/wav/0301.wav +303 downloads/dataset-tun/wav/0303.wav +304 downloads/dataset-tun/wav/0304.wav +305 downloads/dataset-tun/wav/0305.wav +307 downloads/dataset-tun/wav/0307.wav +308 downloads/dataset-tun/wav/0308.wav +309 downloads/dataset-tun/wav/0309.wav +310 downloads/dataset-tun/wav/0310.wav +311 downloads/dataset-tun/wav/0311.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.24.scp b/exp/tts_stats_raw_phn_none/logdir/train.24.scp new file mode 100644 index 0000000000000000000000000000000000000000..9af523e07ac09c1c699e53dd07ea16f13bdd8442 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.24.scp @@ -0,0 +1,43 @@ +312 downloads/dataset-tun/wav/0312.wav +313 downloads/dataset-tun/wav/0313.wav +314 downloads/dataset-tun/wav/0314.wav +315 downloads/dataset-tun/wav/0315.wav +317 downloads/dataset-tun/wav/0317.wav +318 downloads/dataset-tun/wav/0318.wav +32 downloads/dataset-tun/wav/0032.wav +320 downloads/dataset-tun/wav/0320.wav +322 downloads/dataset-tun/wav/0322.wav +323 downloads/dataset-tun/wav/0323.wav +325 downloads/dataset-tun/wav/0325.wav +328 downloads/dataset-tun/wav/0328.wav +33 downloads/dataset-tun/wav/0033.wav +331 downloads/dataset-tun/wav/0331.wav +333 downloads/dataset-tun/wav/0333.wav +336 downloads/dataset-tun/wav/0336.wav +337 downloads/dataset-tun/wav/0337.wav +338 downloads/dataset-tun/wav/0338.wav +339 downloads/dataset-tun/wav/0339.wav +34 downloads/dataset-tun/wav/0034.wav +340 downloads/dataset-tun/wav/0340.wav +341 downloads/dataset-tun/wav/0341.wav +342 downloads/dataset-tun/wav/0342.wav +345 downloads/dataset-tun/wav/0345.wav +346 downloads/dataset-tun/wav/0346.wav +348 downloads/dataset-tun/wav/0348.wav +35 downloads/dataset-tun/wav/0035.wav +351 downloads/dataset-tun/wav/0351.wav +354 downloads/dataset-tun/wav/0354.wav +356 downloads/dataset-tun/wav/0356.wav +357 downloads/dataset-tun/wav/0357.wav +358 downloads/dataset-tun/wav/0358.wav +36 downloads/dataset-tun/wav/0036.wav +360 downloads/dataset-tun/wav/0360.wav +361 downloads/dataset-tun/wav/0361.wav +362 downloads/dataset-tun/wav/0362.wav +363 downloads/dataset-tun/wav/0363.wav +364 downloads/dataset-tun/wav/0364.wav +365 downloads/dataset-tun/wav/0365.wav +366 downloads/dataset-tun/wav/0366.wav +367 downloads/dataset-tun/wav/0367.wav +37 downloads/dataset-tun/wav/0037.wav +370 downloads/dataset-tun/wav/0370.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.27.scp b/exp/tts_stats_raw_phn_none/logdir/train.27.scp new file mode 100644 index 0000000000000000000000000000000000000000..1247c3419f44f81216ea327665d08441b212d65f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.27.scp @@ -0,0 +1,43 @@ +484 downloads/dataset-tun/wav/0484.wav +485 downloads/dataset-tun/wav/0485.wav +487 downloads/dataset-tun/wav/0487.wav +488 downloads/dataset-tun/wav/0488.wav +489 downloads/dataset-tun/wav/0489.wav +49 downloads/dataset-tun/wav/0049.wav +490 downloads/dataset-tun/wav/0490.wav +492 downloads/dataset-tun/wav/0492.wav +493 downloads/dataset-tun/wav/0493.wav +495 downloads/dataset-tun/wav/0495.wav +496 downloads/dataset-tun/wav/0496.wav +497 downloads/dataset-tun/wav/0497.wav +499 downloads/dataset-tun/wav/0499.wav +5 downloads/dataset-tun/wav/0005.wav +50 downloads/dataset-tun/wav/0050.wav +500 downloads/dataset-tun/wav/0500.wav +501 downloads/dataset-tun/wav/0501.wav +502 downloads/dataset-tun/wav/0502.wav +503 downloads/dataset-tun/wav/0503.wav +505 downloads/dataset-tun/wav/0505.wav +506 downloads/dataset-tun/wav/0506.wav +507 downloads/dataset-tun/wav/0507.wav +509 downloads/dataset-tun/wav/0509.wav +51 downloads/dataset-tun/wav/0051.wav +510 downloads/dataset-tun/wav/0510.wav +513 downloads/dataset-tun/wav/0513.wav +515 downloads/dataset-tun/wav/0515.wav +518 downloads/dataset-tun/wav/0518.wav +519 downloads/dataset-tun/wav/0519.wav +52 downloads/dataset-tun/wav/0052.wav +522 downloads/dataset-tun/wav/0522.wav +523 downloads/dataset-tun/wav/0523.wav +524 downloads/dataset-tun/wav/0524.wav +525 downloads/dataset-tun/wav/0525.wav +528 downloads/dataset-tun/wav/0528.wav +530 downloads/dataset-tun/wav/0530.wav +534 downloads/dataset-tun/wav/0534.wav +535 downloads/dataset-tun/wav/0535.wav +536 downloads/dataset-tun/wav/0536.wav +537 downloads/dataset-tun/wav/0537.wav +552 downloads/dataset-tun/wav/0552.wav +557 downloads/dataset-tun/wav/0557.wav +558 downloads/dataset-tun/wav/0558.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.28.scp b/exp/tts_stats_raw_phn_none/logdir/train.28.scp new file mode 100644 index 0000000000000000000000000000000000000000..213e36c44841314a93206fdf9dc48b35561a5944 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.28.scp @@ -0,0 +1,43 @@ +561 downloads/dataset-tun/wav/0561.wav +562 downloads/dataset-tun/wav/0562.wav +564 downloads/dataset-tun/wav/0564.wav +566 downloads/dataset-tun/wav/0566.wav +567 downloads/dataset-tun/wav/0567.wav +569 downloads/dataset-tun/wav/0569.wav +570 downloads/dataset-tun/wav/0570.wav +571 downloads/dataset-tun/wav/0571.wav +572 downloads/dataset-tun/wav/0572.wav +574 downloads/dataset-tun/wav/0574.wav +577 downloads/dataset-tun/wav/0577.wav +578 downloads/dataset-tun/wav/0578.wav +579 downloads/dataset-tun/wav/0579.wav +58 downloads/dataset-tun/wav/0058.wav +580 downloads/dataset-tun/wav/0580.wav +583 downloads/dataset-tun/wav/0583.wav +584 downloads/dataset-tun/wav/0584.wav +585 downloads/dataset-tun/wav/0585.wav +586 downloads/dataset-tun/wav/0586.wav +587 downloads/dataset-tun/wav/0587.wav +590 downloads/dataset-tun/wav/0590.wav +591 downloads/dataset-tun/wav/0591.wav +592 downloads/dataset-tun/wav/0592.wav +593 downloads/dataset-tun/wav/0593.wav +595 downloads/dataset-tun/wav/0595.wav +596 downloads/dataset-tun/wav/0596.wav +598 downloads/dataset-tun/wav/0598.wav +599 downloads/dataset-tun/wav/0599.wav +6 downloads/dataset-tun/wav/0006.wav +60 downloads/dataset-tun/wav/0060.wav +601 downloads/dataset-tun/wav/0601.wav +602 downloads/dataset-tun/wav/0602.wav +603 downloads/dataset-tun/wav/0603.wav +604 downloads/dataset-tun/wav/0604.wav +605 downloads/dataset-tun/wav/0605.wav +606 downloads/dataset-tun/wav/0606.wav +607 downloads/dataset-tun/wav/0607.wav +608 downloads/dataset-tun/wav/0608.wav +609 downloads/dataset-tun/wav/0609.wav +61 downloads/dataset-tun/wav/0061.wav +610 downloads/dataset-tun/wav/0610.wav +611 downloads/dataset-tun/wav/0611.wav +613 downloads/dataset-tun/wav/0613.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.29.scp b/exp/tts_stats_raw_phn_none/logdir/train.29.scp new file mode 100644 index 0000000000000000000000000000000000000000..a4bf10c6095c6306f96d6058a2d52413f3fa8749 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.29.scp @@ -0,0 +1,43 @@ +614 downloads/dataset-tun/wav/0614.wav +617 downloads/dataset-tun/wav/0617.wav +619 downloads/dataset-tun/wav/0619.wav +62 downloads/dataset-tun/wav/0062.wav +620 downloads/dataset-tun/wav/0620.wav +621 downloads/dataset-tun/wav/0621.wav +622 downloads/dataset-tun/wav/0622.wav +623 downloads/dataset-tun/wav/0623.wav +624 downloads/dataset-tun/wav/0624.wav +625 downloads/dataset-tun/wav/0625.wav +626 downloads/dataset-tun/wav/0626.wav +627 downloads/dataset-tun/wav/0627.wav +629 downloads/dataset-tun/wav/0629.wav +63 downloads/dataset-tun/wav/0063.wav +630 downloads/dataset-tun/wav/0630.wav +631 downloads/dataset-tun/wav/0631.wav +632 downloads/dataset-tun/wav/0632.wav +633 downloads/dataset-tun/wav/0633.wav +636 downloads/dataset-tun/wav/0636.wav +639 downloads/dataset-tun/wav/0639.wav +640 downloads/dataset-tun/wav/0640.wav +641 downloads/dataset-tun/wav/0641.wav +643 downloads/dataset-tun/wav/0643.wav +644 downloads/dataset-tun/wav/0644.wav +645 downloads/dataset-tun/wav/0645.wav +646 downloads/dataset-tun/wav/0646.wav +647 downloads/dataset-tun/wav/0647.wav +648 downloads/dataset-tun/wav/0648.wav +649 downloads/dataset-tun/wav/0649.wav +65 downloads/dataset-tun/wav/0065.wav +650 downloads/dataset-tun/wav/0650.wav +651 downloads/dataset-tun/wav/0651.wav +655 downloads/dataset-tun/wav/0655.wav +656 downloads/dataset-tun/wav/0656.wav +658 downloads/dataset-tun/wav/0658.wav +659 downloads/dataset-tun/wav/0659.wav +660 downloads/dataset-tun/wav/0660.wav +661 downloads/dataset-tun/wav/0661.wav +665 downloads/dataset-tun/wav/0665.wav +666 downloads/dataset-tun/wav/0666.wav +667 downloads/dataset-tun/wav/0667.wav +668 downloads/dataset-tun/wav/0668.wav +669 downloads/dataset-tun/wav/0669.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.30.scp b/exp/tts_stats_raw_phn_none/logdir/train.30.scp new file mode 100644 index 0000000000000000000000000000000000000000..ca1273c16cc3c7c6e985963a5b39c471fb11a82a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.30.scp @@ -0,0 +1,43 @@ +670 downloads/dataset-tun/wav/0670.wav +671 downloads/dataset-tun/wav/0671.wav +674 downloads/dataset-tun/wav/0674.wav +675 downloads/dataset-tun/wav/0675.wav +676 downloads/dataset-tun/wav/0676.wav +677 downloads/dataset-tun/wav/0677.wav +679 downloads/dataset-tun/wav/0679.wav +68 downloads/dataset-tun/wav/0068.wav +680 downloads/dataset-tun/wav/0680.wav +681 downloads/dataset-tun/wav/0681.wav +684 downloads/dataset-tun/wav/0684.wav +685 downloads/dataset-tun/wav/0685.wav +686 downloads/dataset-tun/wav/0686.wav +688 downloads/dataset-tun/wav/0688.wav +69 downloads/dataset-tun/wav/0069.wav +690 downloads/dataset-tun/wav/0690.wav +691 downloads/dataset-tun/wav/0691.wav +693 downloads/dataset-tun/wav/0693.wav +694 downloads/dataset-tun/wav/0694.wav +696 downloads/dataset-tun/wav/0696.wav +697 downloads/dataset-tun/wav/0697.wav +699 downloads/dataset-tun/wav/0699.wav +70 downloads/dataset-tun/wav/0070.wav +701 downloads/dataset-tun/wav/0701.wav +702 downloads/dataset-tun/wav/0702.wav +706 downloads/dataset-tun/wav/0706.wav +707 downloads/dataset-tun/wav/0707.wav +708 downloads/dataset-tun/wav/0708.wav +71 downloads/dataset-tun/wav/0071.wav +711 downloads/dataset-tun/wav/0711.wav +712 downloads/dataset-tun/wav/0712.wav +713 downloads/dataset-tun/wav/0713.wav +714 downloads/dataset-tun/wav/0714.wav +715 downloads/dataset-tun/wav/0715.wav +716 downloads/dataset-tun/wav/0716.wav +717 downloads/dataset-tun/wav/0717.wav +718 downloads/dataset-tun/wav/0718.wav +719 downloads/dataset-tun/wav/0719.wav +72 downloads/dataset-tun/wav/0072.wav +720 downloads/dataset-tun/wav/0720.wav +721 downloads/dataset-tun/wav/0721.wav +723 downloads/dataset-tun/wav/0723.wav +724 downloads/dataset-tun/wav/0724.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.31.scp b/exp/tts_stats_raw_phn_none/logdir/train.31.scp new file mode 100644 index 0000000000000000000000000000000000000000..d107018fac3f062e544e8ae891fdd4468ae92552 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.31.scp @@ -0,0 +1,43 @@ +725 downloads/dataset-tun/wav/0725.wav +726 downloads/dataset-tun/wav/0726.wav +727 downloads/dataset-tun/wav/0727.wav +728 downloads/dataset-tun/wav/0728.wav +729 downloads/dataset-tun/wav/0729.wav +730 downloads/dataset-tun/wav/0730.wav +731 downloads/dataset-tun/wav/0731.wav +732 downloads/dataset-tun/wav/0732.wav +733 downloads/dataset-tun/wav/0733.wav +735 downloads/dataset-tun/wav/0735.wav +736 downloads/dataset-tun/wav/0736.wav +74 downloads/dataset-tun/wav/0074.wav +740 downloads/dataset-tun/wav/0740.wav +741 downloads/dataset-tun/wav/0741.wav +744 downloads/dataset-tun/wav/0744.wav +746 downloads/dataset-tun/wav/0746.wav +747 downloads/dataset-tun/wav/0747.wav +749 downloads/dataset-tun/wav/0749.wav +75 downloads/dataset-tun/wav/0075.wav +750 downloads/dataset-tun/wav/0750.wav +751 downloads/dataset-tun/wav/0751.wav +752 downloads/dataset-tun/wav/0752.wav +753 downloads/dataset-tun/wav/0753.wav +754 downloads/dataset-tun/wav/0754.wav +755 downloads/dataset-tun/wav/0755.wav +756 downloads/dataset-tun/wav/0756.wav +757 downloads/dataset-tun/wav/0757.wav +758 downloads/dataset-tun/wav/0758.wav +759 downloads/dataset-tun/wav/0759.wav +76 downloads/dataset-tun/wav/0076.wav +761 downloads/dataset-tun/wav/0761.wav +762 downloads/dataset-tun/wav/0762.wav +763 downloads/dataset-tun/wav/0763.wav +765 downloads/dataset-tun/wav/0765.wav +766 downloads/dataset-tun/wav/0766.wav +767 downloads/dataset-tun/wav/0767.wav +768 downloads/dataset-tun/wav/0768.wav +769 downloads/dataset-tun/wav/0769.wav +77 downloads/dataset-tun/wav/0077.wav +771 downloads/dataset-tun/wav/0771.wav +772 downloads/dataset-tun/wav/0772.wav +773 downloads/dataset-tun/wav/0773.wav +774 downloads/dataset-tun/wav/0774.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.32.scp b/exp/tts_stats_raw_phn_none/logdir/train.32.scp new file mode 100644 index 0000000000000000000000000000000000000000..1729257a1fef69200db258a45b04eabd7570298a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.32.scp @@ -0,0 +1,43 @@ +775 downloads/dataset-tun/wav/0775.wav +776 downloads/dataset-tun/wav/0776.wav +777 downloads/dataset-tun/wav/0777.wav +78 downloads/dataset-tun/wav/0078.wav +780 downloads/dataset-tun/wav/0780.wav +781 downloads/dataset-tun/wav/0781.wav +782 downloads/dataset-tun/wav/0782.wav +783 downloads/dataset-tun/wav/0783.wav +784 downloads/dataset-tun/wav/0784.wav +785 downloads/dataset-tun/wav/0785.wav +786 downloads/dataset-tun/wav/0786.wav +787 downloads/dataset-tun/wav/0787.wav +788 downloads/dataset-tun/wav/0788.wav +79 downloads/dataset-tun/wav/0079.wav +790 downloads/dataset-tun/wav/0790.wav +791 downloads/dataset-tun/wav/0791.wav +792 downloads/dataset-tun/wav/0792.wav +793 downloads/dataset-tun/wav/0793.wav +795 downloads/dataset-tun/wav/0795.wav +798 downloads/dataset-tun/wav/0798.wav +799 downloads/dataset-tun/wav/0799.wav +8 downloads/dataset-tun/wav/0008.wav +80 downloads/dataset-tun/wav/0080.wav +802 downloads/dataset-tun/wav/0802.wav +803 downloads/dataset-tun/wav/0803.wav +804 downloads/dataset-tun/wav/0804.wav +805 downloads/dataset-tun/wav/0805.wav +810 downloads/dataset-tun/wav/0810.wav +811 downloads/dataset-tun/wav/0811.wav +812 downloads/dataset-tun/wav/0812.wav +813 downloads/dataset-tun/wav/0813.wav +82 downloads/dataset-tun/wav/0082.wav +83 downloads/dataset-tun/wav/0083.wav +86 downloads/dataset-tun/wav/0086.wav +87 downloads/dataset-tun/wav/0087.wav +90 downloads/dataset-tun/wav/0090.wav +91 downloads/dataset-tun/wav/0091.wav +93 downloads/dataset-tun/wav/0093.wav +94 downloads/dataset-tun/wav/0094.wav +95 downloads/dataset-tun/wav/0095.wav +96 downloads/dataset-tun/wav/0096.wav +98 downloads/dataset-tun/wav/0098.wav +99 downloads/dataset-tun/wav/0099.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.4.scp b/exp/tts_stats_raw_phn_none/logdir/train.4.scp new file mode 100644 index 0000000000000000000000000000000000000000..85606420f8341012f634b2875a54964c94ed1528 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.4.scp @@ -0,0 +1,44 @@ +15004 downloads/dataset-tun/wav/15004.wav +15006 downloads/dataset-tun/wav/15006.wav +15010 downloads/dataset-tun/wav/15010.wav +15016 downloads/dataset-tun/wav/15016.wav +15018 downloads/dataset-tun/wav/15018.wav +15019 downloads/dataset-tun/wav/15019.wav +15020 downloads/dataset-tun/wav/15020.wav +15022 downloads/dataset-tun/wav/15022.wav +15025 downloads/dataset-tun/wav/15025.wav +15026 downloads/dataset-tun/wav/15026.wav +15034 downloads/dataset-tun/wav/15034.wav +15042 downloads/dataset-tun/wav/15042.wav +15043 downloads/dataset-tun/wav/15043.wav +15044 downloads/dataset-tun/wav/15044.wav +15086 downloads/dataset-tun/wav/15086.wav +151 downloads/dataset-tun/wav/0151.wav +15195 downloads/dataset-tun/wav/15195.wav +15202 downloads/dataset-tun/wav/15202.wav +15207 downloads/dataset-tun/wav/15207.wav +15212 downloads/dataset-tun/wav/15212.wav +15221 downloads/dataset-tun/wav/15221.wav +15225 downloads/dataset-tun/wav/15225.wav +15268 downloads/dataset-tun/wav/15268.wav +15284 downloads/dataset-tun/wav/15284.wav +15285 downloads/dataset-tun/wav/15285.wav +153 downloads/dataset-tun/wav/0153.wav +15322 downloads/dataset-tun/wav/15322.wav +15483 downloads/dataset-tun/wav/15483.wav +155 downloads/dataset-tun/wav/0155.wav +15501 downloads/dataset-tun/wav/15501.wav +15518 downloads/dataset-tun/wav/15518.wav +15520 downloads/dataset-tun/wav/15520.wav +156 downloads/dataset-tun/wav/0156.wav +15637 downloads/dataset-tun/wav/15637.wav +15638 downloads/dataset-tun/wav/15638.wav +15640 downloads/dataset-tun/wav/15640.wav +15658 downloads/dataset-tun/wav/15658.wav +15687 downloads/dataset-tun/wav/15687.wav +157 downloads/dataset-tun/wav/0157.wav +15703 downloads/dataset-tun/wav/15703.wav +15706 downloads/dataset-tun/wav/15706.wav +15707 downloads/dataset-tun/wav/15707.wav +15720 downloads/dataset-tun/wav/15720.wav +15721 downloads/dataset-tun/wav/15721.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.6.scp b/exp/tts_stats_raw_phn_none/logdir/train.6.scp new file mode 100644 index 0000000000000000000000000000000000000000..ee929a038dc8bf09412164c16eb5189095b74e7a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.6.scp @@ -0,0 +1,43 @@ +18132 downloads/dataset-tun/wav/18132.wav +18133 downloads/dataset-tun/wav/18133.wav +18134 downloads/dataset-tun/wav/18134.wav +18135 downloads/dataset-tun/wav/18135.wav +18136 downloads/dataset-tun/wav/18136.wav +18137 downloads/dataset-tun/wav/18137.wav +18149 downloads/dataset-tun/wav/18149.wav +18150 downloads/dataset-tun/wav/18150.wav +18151 downloads/dataset-tun/wav/18151.wav +18155 downloads/dataset-tun/wav/18155.wav +18156 downloads/dataset-tun/wav/18156.wav +18157 downloads/dataset-tun/wav/18157.wav +18158 downloads/dataset-tun/wav/18158.wav +18159 downloads/dataset-tun/wav/18159.wav +18160 downloads/dataset-tun/wav/18160.wav +18161 downloads/dataset-tun/wav/18161.wav +18162 downloads/dataset-tun/wav/18162.wav +18163 downloads/dataset-tun/wav/18163.wav +18164 downloads/dataset-tun/wav/18164.wav +18166 downloads/dataset-tun/wav/18166.wav +18168 downloads/dataset-tun/wav/18168.wav +18169 downloads/dataset-tun/wav/18169.wav +18171 downloads/dataset-tun/wav/18171.wav +18172 downloads/dataset-tun/wav/18172.wav +18176 downloads/dataset-tun/wav/18176.wav +18177 downloads/dataset-tun/wav/18177.wav +18178 downloads/dataset-tun/wav/18178.wav +18180 downloads/dataset-tun/wav/18180.wav +18181 downloads/dataset-tun/wav/18181.wav +18184 downloads/dataset-tun/wav/18184.wav +18185 downloads/dataset-tun/wav/18185.wav +18187 downloads/dataset-tun/wav/18187.wav +18190 downloads/dataset-tun/wav/18190.wav +18194 downloads/dataset-tun/wav/18194.wav +182 downloads/dataset-tun/wav/0182.wav +18215 downloads/dataset-tun/wav/18215.wav +18216 downloads/dataset-tun/wav/18216.wav +18219 downloads/dataset-tun/wav/18219.wav +18220 downloads/dataset-tun/wav/18220.wav +18223 downloads/dataset-tun/wav/18223.wav +18224 downloads/dataset-tun/wav/18224.wav +18225 downloads/dataset-tun/wav/18225.wav +18227 downloads/dataset-tun/wav/18227.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.7.scp b/exp/tts_stats_raw_phn_none/logdir/train.7.scp new file mode 100644 index 0000000000000000000000000000000000000000..49ee6ba7eda48f00250e71b9ea665a37c5e0e20a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.7.scp @@ -0,0 +1,43 @@ +18230 downloads/dataset-tun/wav/18230.wav +18231 downloads/dataset-tun/wav/18231.wav +18232 downloads/dataset-tun/wav/18232.wav +18233 downloads/dataset-tun/wav/18233.wav +18234 downloads/dataset-tun/wav/18234.wav +18235 downloads/dataset-tun/wav/18235.wav +18238 downloads/dataset-tun/wav/18238.wav +18239 downloads/dataset-tun/wav/18239.wav +18240 downloads/dataset-tun/wav/18240.wav +18242 downloads/dataset-tun/wav/18242.wav +18244 downloads/dataset-tun/wav/18244.wav +18247 downloads/dataset-tun/wav/18247.wav +18252 downloads/dataset-tun/wav/18252.wav +18296 downloads/dataset-tun/wav/18296.wav +183 downloads/dataset-tun/wav/0183.wav +18307 downloads/dataset-tun/wav/18307.wav +18308 downloads/dataset-tun/wav/18308.wav +18326 downloads/dataset-tun/wav/18326.wav +18337 downloads/dataset-tun/wav/18337.wav +18356 downloads/dataset-tun/wav/18356.wav +18375 downloads/dataset-tun/wav/18375.wav +18377 downloads/dataset-tun/wav/18377.wav +18378 downloads/dataset-tun/wav/18378.wav +18380 downloads/dataset-tun/wav/18380.wav +18387 downloads/dataset-tun/wav/18387.wav +18392 downloads/dataset-tun/wav/18392.wav +18394 downloads/dataset-tun/wav/18394.wav +184 downloads/dataset-tun/wav/0184.wav +18402 downloads/dataset-tun/wav/18402.wav +18406 downloads/dataset-tun/wav/18406.wav +18410 downloads/dataset-tun/wav/18410.wav +18415 downloads/dataset-tun/wav/18415.wav +18418 downloads/dataset-tun/wav/18418.wav +18423 downloads/dataset-tun/wav/18423.wav +18427 downloads/dataset-tun/wav/18427.wav +18432 downloads/dataset-tun/wav/18432.wav +18438 downloads/dataset-tun/wav/18438.wav +18446 downloads/dataset-tun/wav/18446.wav +18455 downloads/dataset-tun/wav/18455.wav +18458 downloads/dataset-tun/wav/18458.wav +18470 downloads/dataset-tun/wav/18470.wav +18471 downloads/dataset-tun/wav/18471.wav +18475 downloads/dataset-tun/wav/18475.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.8.scp b/exp/tts_stats_raw_phn_none/logdir/train.8.scp new file mode 100644 index 0000000000000000000000000000000000000000..8434ab210ba391575cde847a444d6363601d63cb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.8.scp @@ -0,0 +1,43 @@ +18482 downloads/dataset-tun/wav/18482.wav +18483 downloads/dataset-tun/wav/18483.wav +18497 downloads/dataset-tun/wav/18497.wav +18498 downloads/dataset-tun/wav/18498.wav +185 downloads/dataset-tun/wav/0185.wav +18503 downloads/dataset-tun/wav/18503.wav +18509 downloads/dataset-tun/wav/18509.wav +18512 downloads/dataset-tun/wav/18512.wav +18535 downloads/dataset-tun/wav/18535.wav +18538 downloads/dataset-tun/wav/18538.wav +18541 downloads/dataset-tun/wav/18541.wav +18543 downloads/dataset-tun/wav/18543.wav +18546 downloads/dataset-tun/wav/18546.wav +18548 downloads/dataset-tun/wav/18548.wav +18552 downloads/dataset-tun/wav/18552.wav +18553 downloads/dataset-tun/wav/18553.wav +18559 downloads/dataset-tun/wav/18559.wav +18562 downloads/dataset-tun/wav/18562.wav +18564 downloads/dataset-tun/wav/18564.wav +18567 downloads/dataset-tun/wav/18567.wav +18568 downloads/dataset-tun/wav/18568.wav +18569 downloads/dataset-tun/wav/18569.wav +18570 downloads/dataset-tun/wav/18570.wav +18571 downloads/dataset-tun/wav/18571.wav +18578 downloads/dataset-tun/wav/18578.wav +18579 downloads/dataset-tun/wav/18579.wav +18580 downloads/dataset-tun/wav/18580.wav +18582 downloads/dataset-tun/wav/18582.wav +18584 downloads/dataset-tun/wav/18584.wav +18590 downloads/dataset-tun/wav/18590.wav +186 downloads/dataset-tun/wav/0186.wav +18601 downloads/dataset-tun/wav/18601.wav +18606 downloads/dataset-tun/wav/18606.wav +18610 downloads/dataset-tun/wav/18610.wav +18624 downloads/dataset-tun/wav/18624.wav +18639 downloads/dataset-tun/wav/18639.wav +18641 downloads/dataset-tun/wav/18641.wav +18642 downloads/dataset-tun/wav/18642.wav +18646 downloads/dataset-tun/wav/18646.wav +18655 downloads/dataset-tun/wav/18655.wav +18658 downloads/dataset-tun/wav/18658.wav +18670 downloads/dataset-tun/wav/18670.wav +18680 downloads/dataset-tun/wav/18680.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/train.9.scp b/exp/tts_stats_raw_phn_none/logdir/train.9.scp new file mode 100644 index 0000000000000000000000000000000000000000..86b27ab2d1c70be34d4563c08b2783723f515ce9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/train.9.scp @@ -0,0 +1,43 @@ +18681 downloads/dataset-tun/wav/18681.wav +18686 downloads/dataset-tun/wav/18686.wav +187 downloads/dataset-tun/wav/0187.wav +18706 downloads/dataset-tun/wav/18706.wav +18707 downloads/dataset-tun/wav/18707.wav +18709 downloads/dataset-tun/wav/18709.wav +18711 downloads/dataset-tun/wav/18711.wav +18720 downloads/dataset-tun/wav/18720.wav +18747 downloads/dataset-tun/wav/18747.wav +18755 downloads/dataset-tun/wav/18755.wav +18758 downloads/dataset-tun/wav/18758.wav +18760 downloads/dataset-tun/wav/18760.wav +18768 downloads/dataset-tun/wav/18768.wav +18769 downloads/dataset-tun/wav/18769.wav +18772 downloads/dataset-tun/wav/18772.wav +18786 downloads/dataset-tun/wav/18786.wav +18796 downloads/dataset-tun/wav/18796.wav +18804 downloads/dataset-tun/wav/18804.wav +18813 downloads/dataset-tun/wav/18813.wav +18835 downloads/dataset-tun/wav/18835.wav +18837 downloads/dataset-tun/wav/18837.wav +18838 downloads/dataset-tun/wav/18838.wav +18841 downloads/dataset-tun/wav/18841.wav +18851 downloads/dataset-tun/wav/18851.wav +18858 downloads/dataset-tun/wav/18858.wav +18861 downloads/dataset-tun/wav/18861.wav +18862 downloads/dataset-tun/wav/18862.wav +18867 downloads/dataset-tun/wav/18867.wav +18870 downloads/dataset-tun/wav/18870.wav +18873 downloads/dataset-tun/wav/18873.wav +18886 downloads/dataset-tun/wav/18886.wav +18893 downloads/dataset-tun/wav/18893.wav +18899 downloads/dataset-tun/wav/18899.wav +189 downloads/dataset-tun/wav/0189.wav +18904 downloads/dataset-tun/wav/18904.wav +18906 downloads/dataset-tun/wav/18906.wav +18908 downloads/dataset-tun/wav/18908.wav +18909 downloads/dataset-tun/wav/18909.wav +18910 downloads/dataset-tun/wav/18910.wav +18912 downloads/dataset-tun/wav/18912.wav +18915 downloads/dataset-tun/wav/18915.wav +18919 downloads/dataset-tun/wav/18919.wav +18934 downloads/dataset-tun/wav/18934.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.1.scp b/exp/tts_stats_raw_phn_none/logdir/valid.1.scp new file mode 100644 index 0000000000000000000000000000000000000000..4809e85d6f205a80c392671ebe48eab0ee98de1d --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.1.scp @@ -0,0 +1,3 @@ +115 downloads/dataset-tun/wav/0115.wav +119 downloads/dataset-tun/wav/0119.wav +120 downloads/dataset-tun/wav/0120.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.10.scp b/exp/tts_stats_raw_phn_none/logdir/valid.10.scp new file mode 100644 index 0000000000000000000000000000000000000000..8c02c7ec7cd254cacf4ff1b323cf475bdc7d7cc9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.10.scp @@ -0,0 +1,2 @@ +169 downloads/dataset-tun/wav/0169.wav +18237 downloads/dataset-tun/wav/18237.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.11.scp b/exp/tts_stats_raw_phn_none/logdir/valid.11.scp new file mode 100644 index 0000000000000000000000000000000000000000..e75d80fd42efc3eaa17e4e7608672c749f5ee6fd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.11.scp @@ -0,0 +1,2 @@ +18774 downloads/dataset-tun/wav/18774.wav +18913 downloads/dataset-tun/wav/18913.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.12.scp b/exp/tts_stats_raw_phn_none/logdir/valid.12.scp new file mode 100644 index 0000000000000000000000000000000000000000..96b8944f8f2fd979ee708e0f75e4e25491db6256 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.12.scp @@ -0,0 +1,2 @@ +18963 downloads/dataset-tun/wav/18963.wav +19178 downloads/dataset-tun/wav/19178.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.13.scp b/exp/tts_stats_raw_phn_none/logdir/valid.13.scp new file mode 100644 index 0000000000000000000000000000000000000000..ec234be190fbf1795711a5b8691ff7c588534e0a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.13.scp @@ -0,0 +1,2 @@ +19312 downloads/dataset-tun/wav/19312.wav +19335 downloads/dataset-tun/wav/19335.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.14.scp b/exp/tts_stats_raw_phn_none/logdir/valid.14.scp new file mode 100644 index 0000000000000000000000000000000000000000..0c0ce424af9241370c86af58b8d076fd24d2b1dd --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.14.scp @@ -0,0 +1,2 @@ +19517 downloads/dataset-tun/wav/19517.wav +19536 downloads/dataset-tun/wav/19536.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.15.scp b/exp/tts_stats_raw_phn_none/logdir/valid.15.scp new file mode 100644 index 0000000000000000000000000000000000000000..9ed82b9401421907259e3d1f3132c6df499f45b1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.15.scp @@ -0,0 +1,2 @@ +19769 downloads/dataset-tun/wav/19769.wav +19771 downloads/dataset-tun/wav/19771.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.16.scp b/exp/tts_stats_raw_phn_none/logdir/valid.16.scp new file mode 100644 index 0000000000000000000000000000000000000000..ba42ff3656b13e3bbcb1afb49a9ad6955ca850b1 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.16.scp @@ -0,0 +1,2 @@ +19975 downloads/dataset-tun/wav/19975.wav +20258 downloads/dataset-tun/wav/20258.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.17.scp b/exp/tts_stats_raw_phn_none/logdir/valid.17.scp new file mode 100644 index 0000000000000000000000000000000000000000..d7a54c41a90ab09e512d0ee0c7836da4c81e055f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.17.scp @@ -0,0 +1,2 @@ +20265 downloads/dataset-tun/wav/20265.wav +20613 downloads/dataset-tun/wav/20613.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.18.scp b/exp/tts_stats_raw_phn_none/logdir/valid.18.scp new file mode 100644 index 0000000000000000000000000000000000000000..7e1800a6c63c8c4646505f8cb63b6e40ca9f1c50 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.18.scp @@ -0,0 +1,2 @@ +20642 downloads/dataset-tun/wav/20642.wav +20701 downloads/dataset-tun/wav/20701.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.19.scp b/exp/tts_stats_raw_phn_none/logdir/valid.19.scp new file mode 100644 index 0000000000000000000000000000000000000000..c076fc72d67770e6083803aed58eb4bc0d21a8f8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.19.scp @@ -0,0 +1,2 @@ +21 downloads/dataset-tun/wav/0021.wav +21440 downloads/dataset-tun/wav/21440.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.2.scp b/exp/tts_stats_raw_phn_none/logdir/valid.2.scp new file mode 100644 index 0000000000000000000000000000000000000000..7867e0499097e1cbde5173481c6e35713d4f449f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.2.scp @@ -0,0 +1,2 @@ +123 downloads/dataset-tun/wav/0123.wav +125 downloads/dataset-tun/wav/0125.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.20.scp b/exp/tts_stats_raw_phn_none/logdir/valid.20.scp new file mode 100644 index 0000000000000000000000000000000000000000..04f4a58f9df91f7a0c48e5949ba46c425efbf31a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.20.scp @@ -0,0 +1,2 @@ +21499 downloads/dataset-tun/wav/21499.wav +21601 downloads/dataset-tun/wav/21601.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.21.scp b/exp/tts_stats_raw_phn_none/logdir/valid.21.scp new file mode 100644 index 0000000000000000000000000000000000000000..11b8d02a0d8973038d916781d8cf41a930c111c5 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.21.scp @@ -0,0 +1,2 @@ +280 downloads/dataset-tun/wav/0280.wav +286 downloads/dataset-tun/wav/0286.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.22.scp b/exp/tts_stats_raw_phn_none/logdir/valid.22.scp new file mode 100644 index 0000000000000000000000000000000000000000..1b1d8850bb8ba12d9b5850e18e19d7ad19f6eb6a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.22.scp @@ -0,0 +1,2 @@ +287 downloads/dataset-tun/wav/0287.wav +296 downloads/dataset-tun/wav/0296.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.24.scp b/exp/tts_stats_raw_phn_none/logdir/valid.24.scp new file mode 100644 index 0000000000000000000000000000000000000000..6c3f4acb8063575c2e17e2df04c03f0f7cc97d83 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.24.scp @@ -0,0 +1,2 @@ +531 downloads/dataset-tun/wav/0531.wav +538 downloads/dataset-tun/wav/0538.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.25.scp b/exp/tts_stats_raw_phn_none/logdir/valid.25.scp new file mode 100644 index 0000000000000000000000000000000000000000..e908c6d93564292c5bcd89128f0c6bf4e4dbc8e8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.25.scp @@ -0,0 +1,2 @@ +539 downloads/dataset-tun/wav/0539.wav +540 downloads/dataset-tun/wav/0540.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.26.scp b/exp/tts_stats_raw_phn_none/logdir/valid.26.scp new file mode 100644 index 0000000000000000000000000000000000000000..f7ccf27f3710303ceca33ca79576d5bdf49aca10 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.26.scp @@ -0,0 +1,2 @@ +545 downloads/dataset-tun/wav/0545.wav +547 downloads/dataset-tun/wav/0547.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.28.scp b/exp/tts_stats_raw_phn_none/logdir/valid.28.scp new file mode 100644 index 0000000000000000000000000000000000000000..38085e26585a2176360ef90bebadf75249a5c25c --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.28.scp @@ -0,0 +1,2 @@ +554 downloads/dataset-tun/wav/0554.wav +559 downloads/dataset-tun/wav/0559.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.29.scp b/exp/tts_stats_raw_phn_none/logdir/valid.29.scp new file mode 100644 index 0000000000000000000000000000000000000000..eeb5656c8e3769be781afbdbf7b1be5f7f9ca4fa --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.29.scp @@ -0,0 +1,2 @@ +560 downloads/dataset-tun/wav/0560.wav +588 downloads/dataset-tun/wav/0588.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.3.scp b/exp/tts_stats_raw_phn_none/logdir/valid.3.scp new file mode 100644 index 0000000000000000000000000000000000000000..c69e15637736682162b4702f55f9a8a435d8e86a --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.3.scp @@ -0,0 +1,2 @@ +126 downloads/dataset-tun/wav/0126.wav +128 downloads/dataset-tun/wav/0128.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.31.scp b/exp/tts_stats_raw_phn_none/logdir/valid.31.scp new file mode 100644 index 0000000000000000000000000000000000000000..7c5398c02e0232c8ced1bd7e9e79f53e15a83849 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.31.scp @@ -0,0 +1,2 @@ +678 downloads/dataset-tun/wav/0678.wav +698 downloads/dataset-tun/wav/0698.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.32.scp b/exp/tts_stats_raw_phn_none/logdir/valid.32.scp new file mode 100644 index 0000000000000000000000000000000000000000..2378b3d91bdadc48f2ecef24c9fbb9f0edca39d8 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.32.scp @@ -0,0 +1,2 @@ +739 downloads/dataset-tun/wav/0739.wav +808 downloads/dataset-tun/wav/0808.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.4.scp b/exp/tts_stats_raw_phn_none/logdir/valid.4.scp new file mode 100644 index 0000000000000000000000000000000000000000..157caa32132949feb059f868501930b3d87439b4 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.4.scp @@ -0,0 +1,2 @@ +129 downloads/dataset-tun/wav/0129.wav +130 downloads/dataset-tun/wav/0130.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.5.scp b/exp/tts_stats_raw_phn_none/logdir/valid.5.scp new file mode 100644 index 0000000000000000000000000000000000000000..6fe04806311ee90a58543f3577407174ed5ae030 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.5.scp @@ -0,0 +1,2 @@ +131 downloads/dataset-tun/wav/0131.wav +14616 downloads/dataset-tun/wav/14616.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.6.scp b/exp/tts_stats_raw_phn_none/logdir/valid.6.scp new file mode 100644 index 0000000000000000000000000000000000000000..68f96b67eda25c47f726b63968657ec9c5ed64ec --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.6.scp @@ -0,0 +1,2 @@ +14849 downloads/dataset-tun/wav/14849.wav +14891 downloads/dataset-tun/wav/14891.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.7.scp b/exp/tts_stats_raw_phn_none/logdir/valid.7.scp new file mode 100644 index 0000000000000000000000000000000000000000..dc07f1bd937af713d34df62bbeaf1faa253d8f87 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.7.scp @@ -0,0 +1,2 @@ +14941 downloads/dataset-tun/wav/14941.wav +14991 downloads/dataset-tun/wav/14991.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.8.scp b/exp/tts_stats_raw_phn_none/logdir/valid.8.scp new file mode 100644 index 0000000000000000000000000000000000000000..e4eeb37bd7b52b933fc82a4160472495ddecc000 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.8.scp @@ -0,0 +1,2 @@ +15003 downloads/dataset-tun/wav/15003.wav +15079 downloads/dataset-tun/wav/15079.wav diff --git a/exp/tts_stats_raw_phn_none/logdir/valid.9.scp b/exp/tts_stats_raw_phn_none/logdir/valid.9.scp new file mode 100644 index 0000000000000000000000000000000000000000..88a334a22d4fd6e623dec17a4727a263aa9dacbb --- /dev/null +++ b/exp/tts_stats_raw_phn_none/logdir/valid.9.scp @@ -0,0 +1,2 @@ +15269 downloads/dataset-tun/wav/15269.wav +15665 downloads/dataset-tun/wav/15665.wav diff --git a/exp/tts_stats_raw_phn_none/run.sh b/exp/tts_stats_raw_phn_none/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e1395c4ed118feedfe2239c8cadcd253eb17c0f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/run.sh @@ -0,0 +1 @@ +./tts.sh --stage 1 --lang ar --feats_type raw --fs 22050 --n_fft 1024 --n_shift 256 --token_type phn --cleaner none --g2p none --train_config conf/tuning/finetune_tacotron2.yaml --inference_config conf/tuning/decode_tacotron2.yaml --train_set train --valid_set dev --test_sets test --srctexts data/train/text --audio_format wav --stage 2 --stop-stage 5 --stage 5 "$@"; exit $? diff --git a/exp/tts_stats_raw_phn_none/train/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/train/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..a96e38b7686a62e33bf1e1d2f4a361dca33b7873 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/train/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf5739cd6e4700c0ada7d22c02cae4226b020c651472e6e05c10d4913b0135aa +size 778 diff --git a/exp/tts_stats_raw_phn_none/train/feats_stats.npz b/exp/tts_stats_raw_phn_none/train/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..b502486514a88b6911fab9d7439287625ebea56f --- /dev/null +++ b/exp/tts_stats_raw_phn_none/train/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6eba8eff7e21e0fd3549ec9b9f4ee1ab871dca3b9f72c1f864e0ce4d742ea3 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/train/speech_shape b/exp/tts_stats_raw_phn_none/train/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..4550190b67ac8671f38f92a9deac93eeed67ae02 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/train/speech_shape @@ -0,0 +1,1381 @@ +1 203776 +100 121856 +101 111872 +103 189440 +104 164096 +105 154112 +106 191488 +108 141312 +109 134144 +110 166400 +111 129280 +112 166912 +113 152064 +114 184064 +116 213760 +117 236191 +118 216396 +124 206592 +127 188416 +13 109568 +132 182784 +133 269824 +135 141056 +136 129792 +137 132864 +139 240128 +14 220672 +141 142848 +142 137728 +143 238336 +144 133888 +145 141824 +14502 134144 +14507 194048 +14520 178176 +14522 190976 +14523 108544 +14532 187904 +14538 207616 +14540 168448 +14551 220160 +146 106496 +14602 206592 +14612 174336 +14626 178432 +14630 170240 +14631 216320 +14632 282880 +14636 140288 +14643 97792 +147 88064 +14731 255232 +14732 157440 +14733 112896 +14736 131584 +14737 178688 +14739 207872 +14757 292096 +14761 113408 +14763 177408 +14764 191744 +14780 159232 +14781 178176 +14783 182272 +14784 171264 +14786 187904 +14799 185344 +148 171776 +14802 253952 +14813 216576 +14818 191488 +14820 159744 +14822 183040 +14840 229506 +14841 137472 +14845 228608 +14856 261632 +14858 217600 +14865 139520 +14876 166912 +14878 188672 +14879 174080 +14880 295936 +14881 187648 +14882 217088 +14883 186880 +14890 179712 +14892 278272 +14896 139520 +149 180736 +14902 248064 +14906 204288 +14910 168704 +14916 253184 +14919 151808 +14920 129024 +14924 136960 +14931 231424 +14932 218880 +14933 211712 +14934 252416 +14938 247552 +14942 247296 +14943 226560 +14944 195584 +14950 221696 +14951 237568 +14954 147456 +14955 179200 +14956 251136 +14957 209920 +14958 315392 +14959 230144 +14960 162816 +14961 179712 +14962 145664 +14963 256256 +14964 267008 +14966 189440 +14967 109056 +14973 165888 +14983 187136 +14986 157440 +14988 156160 +14989 165888 +14990 159232 +14992 146176 +14997 151296 +15 122880 +15000 158720 +15001 121600 +15002 111616 +15004 172544 +15006 129536 +15010 164864 +15016 192000 +15018 284672 +15019 149760 +15020 148480 +15022 182272 +15025 162816 +15026 211200 +15034 165888 +15042 139264 +15043 261632 +15044 260096 +15086 250624 +151 105472 +15195 192256 +15202 231936 +15207 218368 +15212 303872 +15221 144128 +15225 192000 +15268 166912 +15284 92416 +15285 113920 +153 137472 +15322 160768 +15483 171264 +155 109568 +15501 113152 +15518 118528 +15520 171008 +156 147456 +15637 143616 +15638 171776 +15640 176640 +15658 205568 +15687 157696 +157 135168 +15703 159488 +15706 132096 +15707 142592 +15720 115200 +15721 234752 +15763 190720 +158 149504 +15805 211456 +15834 113152 +15861 143360 +15897 109824 +159 99584 +15921 118272 +160 88064 +161 149760 +16124 169984 +16132 163328 +16136 108032 +16202 118016 +16296 92416 +16297 162560 +163 99072 +16324 128512 +16345 138240 +16346 248064 +16347 281344 +16348 169472 +16349 210176 +16350 190464 +16351 202752 +16368 234752 +164 113408 +16407 133888 +16412 120832 +16413 309504 +16414 208128 +166 128000 +167 138496 +16765 126464 +17 132096 +172 113152 +173 103424 +175 123136 +176 140032 +177 177408 +178 199680 +179 167424 +18 94720 +181 151040 +18132 170496 +18133 329216 +18134 329472 +18135 248064 +18136 332032 +18137 313600 +18149 209664 +18150 170496 +18151 179968 +18155 338944 +18156 183808 +18157 211968 +18158 239872 +18159 268288 +18160 268032 +18161 314368 +18162 183040 +18163 178688 +18164 183040 +18166 224256 +18168 289280 +18169 250368 +18171 336640 +18172 242176 +18176 283392 +18177 159488 +18178 119296 +18180 192768 +18181 359424 +18184 237824 +18185 197120 +18187 249600 +18190 268288 +18194 195840 +182 208640 +18215 118528 +18216 195584 +18219 192256 +18220 226560 +18223 165120 +18224 173824 +18225 196864 +18227 188928 +18230 229888 +18231 136448 +18232 259328 +18233 190976 +18234 142592 +18235 116992 +18238 106496 +18239 132608 +18240 202496 +18242 155904 +18244 132096 +18247 98560 +18252 175616 +18296 139008 +183 171776 +18307 166144 +18308 124160 +18326 139264 +18337 154624 +18356 142080 +18375 122880 +18377 123392 +18378 110848 +18380 182528 +18387 153856 +18392 163840 +18394 206848 +184 110592 +18402 150016 +18406 163072 +18410 98816 +18415 148480 +18418 177408 +18423 130560 +18427 217088 +18432 166912 +18438 151040 +18446 157696 +18455 120832 +18458 173312 +18470 158464 +18471 125952 +18475 219648 +18482 144128 +18483 169984 +18497 119552 +18498 128512 +185 209152 +18503 192000 +18509 153856 +18512 188672 +18535 187648 +18538 127488 +18541 132352 +18543 120320 +18546 175616 +18548 139264 +18552 187648 +18553 159488 +18559 151296 +18562 115200 +18564 153088 +18567 193536 +18568 139520 +18569 176640 +18570 135424 +18571 204288 +18578 133376 +18579 118528 +18580 96512 +18582 186624 +18584 151552 +18590 177664 +186 178432 +18601 130560 +18606 187904 +18610 115712 +18624 213760 +18639 189184 +18641 125440 +18642 175104 +18646 124672 +18655 138240 +18658 138240 +18670 125952 +18680 197120 +18681 195328 +18686 142336 +187 171264 +18706 269824 +18707 189696 +18709 145152 +18711 114688 +18720 132352 +18747 200192 +18755 96256 +18758 217344 +18760 128000 +18768 193024 +18769 105728 +18772 164608 +18786 253952 +18796 185269 +18804 201984 +18813 178176 +18835 184832 +18837 192512 +18838 208384 +18841 211968 +18851 199936 +18858 146688 +18861 149504 +18862 215296 +18867 199168 +18870 221952 +18873 191744 +18886 203264 +18893 230144 +18899 134912 +189 160768 +18904 183808 +18906 147712 +18908 136448 +18909 156160 +18910 211968 +18912 167680 +18915 148480 +18919 197888 +18934 191232 +18935 142336 +18936 141568 +18943 175360 +18944 173824 +18947 190208 +18951 154368 +18955 233216 +18959 226560 +18964 163584 +18982 113664 +18989 163072 +18991 212480 +18993 175872 +18997 101888 +19 122880 +19001 217088 +19005 184832 +19010 156928 +19011 175872 +19015 139520 +19024 165888 +19028 158720 +19063 187136 +19065 144128 +19067 175616 +19075 163584 +19076 214784 +19090 172544 +19091 199936 +19095 118016 +19096 165888 +19099 159488 +191 134144 +19103 124416 +19109 132352 +19111 151740 +19113 129280 +19116 155648 +19118 174336 +19121 137472 +19122 144896 +19132 131072 +19138 135936 +19141 129024 +19142 114944 +19157 133888 +19160 140032 +19163 215040 +19165 201216 +19177 184832 +19180 195584 +19181 237568 +19194 147968 +19197 144128 +192 141568 +19201 260608 +19211 170240 +19212 206080 +19213 167168 +19218 185856 +19224 159488 +19225 113920 +19229 149248 +19234 193792 +19237 164352 +19241 160512 +19251 180736 +19263 96000 +19267 141824 +19271 105216 +19276 142336 +19280 153344 +19281 169728 +19295 168448 +19298 119808 +19304 154624 +19310 141056 +19316 146944 +19321 209974 +19325 133120 +19327 170240 +19333 218624 +19337 188928 +19347 234240 +19348 214016 +19357 189440 +19360 171776 +19366 143104 +19367 199936 +19371 145920 +19372 162816 +19374 145664 +19376 201682 +19387 219904 +19396 130048 +19399 112896 +194 140032 +19400 183808 +19404 159488 +19406 186624 +19410 183552 +19413 121088 +19414 134912 +19423 198400 +19429 195328 +19439 114944 +19440 97280 +19449 159488 +19451 140032 +19454 120320 +19477 191488 +19482 157696 +19488 169472 +19496 129792 +19499 153344 +195 122624 +19501 137216 +19506 162816 +19509 143872 +19510 119040 +19511 146688 +19521 132864 +19522 167680 +19524 146944 +19529 188928 +19540 193536 +19542 179456 +19543 159669 +19548 138752 +19550 167680 +19551 146176 +19554 196096 +19556 163840 +19558 221952 +19565 262144 +19569 183552 +19576 125952 +19581 146406 +19584 137216 +19585 116480 +19587 133376 +19590 167424 +19595 171520 +19598 146944 +196 125696 +19601 233984 +19604 182784 +19605 187648 +19608 147200 +19611 169472 +19612 229120 +19621 151296 +19622 215552 +19627 184064 +19631 179456 +19635 172800 +19638 178595 +19651 165632 +19658 211712 +19659 147200 +19673 156851 +19676 174080 +19683 141568 +19684 172800 +19688 179200 +19692 149504 +19695 130048 +197 130816 +19700 178688 +19702 179200 +19703 175872 +19705 176896 +19713 150272 +19714 196352 +19723 196864 +19726 153856 +19748 110336 +19749 249088 +19753 98048 +19760 154624 +19763 157696 +19764 145152 +19775 123648 +19778 92416 +19782 203776 +19783 274944 +19785 126464 +19787 186368 +19789 132864 +19791 174592 +19797 141056 +19798 217344 +198 150528 +19801 117248 +19812 144640 +19826 137472 +19851 181423 +19854 124928 +19855 181760 +19864 187904 +19874 183296 +19875 131328 +19877 162048 +19884 145920 +19888 195584 +19889 176640 +199 187648 +19902 143104 +19903 194304 +19907 250112 +19913 130304 +19917 127232 +19918 177920 +19920 218368 +19926 236032 +19928 141056 +19931 133376 +19935 203520 +19938 102144 +19944 126464 +19946 116992 +19947 154112 +19948 171637 +19949 141056 +19951 214272 +19952 165376 +19955 134912 +19957 150596 +19959 176896 +19976 169472 +19979 119808 +19981 134144 +19984 171520 +19990 235008 +19998 195840 +200 125440 +20001 184576 +20005 108032 +20020 164608 +20022 235264 +20029 174080 +20038 216576 +20042 241920 +20051 203776 +20055 168448 +20062 152064 +20080 219136 +20087 116992 +20095 193792 +201 119040 +20109 167424 +20119 149760 +20120 154368 +20121 172288 +20128 143872 +20144 112128 +20147 167168 +20183 139520 +20185 157184 +202 162816 +20205 131840 +20208 215040 +20216 119296 +20219 147456 +20224 140544 +20231 194560 +20232 172288 +20234 163584 +20236 236032 +20237 199680 +20239 239872 +20251 163072 +20255 141312 +20256 130048 +20257 214782 +20268 140800 +20273 171264 +20278 107776 +20279 183296 +20284 203132 +20291 166400 +20293 137984 +20295 176128 +20297 169216 +20298 195840 +20303 160256 +20304 131584 +20305 169984 +20309 139008 +20311 106240 +20315 152832 +20324 189952 +20327 155904 +20332 162560 +20339 100096 +20346 145664 +20356 150016 +20360 146176 +20371 230912 +20390 118528 +20395 98048 +20396 101120 +20397 145920 +20399 184998 +204 172800 +20402 146688 +20406 152832 +20408 90112 +20410 104448 +20413 85248 +20422 138752 +20427 174848 +20435 162816 +20438 158208 +20440 123648 +20442 184715 +20445 159232 +20447 106496 +20461 272896 +20464 187904 +20465 173056 +20482 133376 +20484 178944 +20488 192768 +20489 157952 +20495 154880 +20496 141056 +205 124672 +20503 169216 +20504 202496 +20512 149248 +20513 167936 +20516 112128 +20517 131328 +20520 216320 +20523 153088 +20524 145152 +20532 243200 +20535 122880 +20547 169472 +20549 176640 +20554 97792 +20555 127100 +20558 230912 +20561 246272 +20575 118784 +20576 142592 +20582 173056 +20584 170752 +20587 176384 +20589 164864 +20597 161792 +206 169472 +20604 192000 +20605 161280 +20606 153600 +20607 131072 +20616 141568 +20617 170752 +20621 168704 +20622 169984 +20629 164096 +20633 119552 +20635 141312 +20643 150528 +20644 158720 +20645 146944 +20663 215296 +20665 101376 +20668 179200 +20673 140288 +20674 123904 +20675 126464 +20679 157184 +20698 146176 +20699 156672 +207 188672 +20702 198144 +20708 192256 +20709 163840 +20710 169984 +20714 189696 +20729 136448 +20739 119040 +20740 171776 +20751 106496 +20754 185856 +20755 140544 +208 235776 +209 127744 +20913 151040 +20914 249344 +20923 160000 +20926 179200 +20928 190464 +20935 124928 +20954 287744 +20955 193792 +20959 159232 +20961 192256 +20982 189696 +20988 114688 +20992 158720 +21002 201984 +21005 273152 +21017 197376 +21020 150272 +21022 200704 +21028 141568 +21030 172544 +21033 156160 +21038 102400 +21043 175616 +21045 93952 +21055 187648 +21061 178944 +21072 152320 +21075 166656 +21082 130304 +21084 162304 +21085 154368 +21088 207104 +21097 187392 +21098 152064 +211 152320 +21100 204032 +21101 217600 +21103 156160 +21107 138240 +21109 171264 +21113 141312 +21115 124928 +21129 194816 +21139 219392 +21143 148736 +21144 197888 +21147 139520 +21148 147200 +21158 156160 +21165 200448 +21170 154880 +21175 182528 +21176 139264 +21178 123904 +21192 178176 +21193 121088 +21199 142592 +212 126976 +21201 158208 +21210 144896 +21214 154624 +21228 154624 +21235 212224 +21239 207616 +21248 198656 +21252 178432 +21266 185344 +21269 163328 +21270 161536 +21271 159488 +21282 216320 +21287 149760 +21290 114944 +213 157952 +21307 129024 +21308 159744 +21309 170752 +21310 182528 +21312 175104 +21348 143616 +21349 126208 +21362 223232 +21363 226048 +21366 185088 +21368 133632 +21372 193792 +21375 141312 +21379 148992 +21382 143104 +21385 175616 +21388 133632 +21397 121600 +214 112640 +21401 217600 +21402 162304 +21403 138240 +21411 150272 +21412 185856 +21414 187904 +21420 177076 +21422 159232 +21424 177920 +21425 131840 +21433 194048 +21447 176384 +21449 159232 +21475 168192 +21479 169216 +21481 91136 +21492 144640 +21498 145152 +21500 178432 +21502 176384 +21503 204288 +21505 161024 +21511 140800 +21515 246528 +21517 182784 +21521 139776 +21525 198400 +21526 122368 +21536 110592 +21539 181248 +21540 103936 +21544 144640 +21546 135424 +21547 128000 +21560 148992 +21567 214016 +21569 124160 +21571 101632 +21575 159232 +21584 103680 +21590 137728 +21591 188160 +21592 148480 +21596 135424 +21597 84480 +21598 126464 +21604 147712 +21605 170240 +21609 121600 +21623 173568 +21624 192256 +21675 184320 +21726 114432 +21740 138496 +21744 138752 +218 152576 +219 92672 +22 86272 +220 132608 +222 171264 +223 130048 +227 129280 +229 115712 +231 162048 +233 187392 +234 147968 +235 129024 +236 127744 +238 223488 +240 142848 +241 139520 +242 107520 +243 128256 +244 127232 +246 161792 +249 152576 +250 121344 +251 135424 +253 174961 +254 102144 +257 132608 +258 124672 +259 163840 +260 139520 +261 186624 +264 122368 +265 149248 +266 196608 +268 136192 +269 129280 +271 190720 +272 155392 +274 150528 +275 127232 +277 182528 +278 150784 +281 183808 +282 179968 +283 108800 +284 171520 +288 147968 +289 194304 +290 175360 +293 119296 +294 137728 +295 127232 +297 122368 +299 124160 +3 109568 +300 137984 +301 214784 +303 130890 +304 142592 +305 98816 +307 141824 +308 96256 +309 103168 +310 141568 +311 113152 +312 97536 +313 118272 +314 121600 +315 76288 +317 103680 +318 192512 +32 179456 +320 138496 +322 128512 +323 98560 +325 92672 +328 181248 +33 104192 +331 155136 +333 104192 +336 106240 +337 90368 +338 119040 +339 134144 +34 130048 +340 152320 +341 146688 +342 170752 +345 151552 +346 171008 +348 155904 +35 98816 +351 189696 +354 119296 +356 117248 +357 128000 +358 109824 +36 119296 +360 154368 +361 93184 +362 145664 +363 104448 +364 107264 +365 171520 +366 170240 +367 106752 +37 124416 +370 137216 +372 126208 +373 199936 +374 185491 +375 111104 +376 116992 +377 108544 +378 177920 +379 142080 +38 162560 +381 148992 +382 141056 +384 130304 +386 175360 +388 113152 +389 152320 +392 229120 +393 160512 +394 109824 +396 102144 +398 145664 +399 184064 +40 133376 +400 158464 +401 131072 +402 180480 +404 114688 +405 123648 +406 118784 +407 137472 +41 107776 +411 122624 +413 133632 +414 106752 +415 123904 +416 139008 +42 125184 +421 122880 +422 104960 +424 165376 +425 203008 +426 147968 +427 99584 +429 162816 +43 116736 +430 135168 +431 149760 +432 164864 +433 120064 +434 152320 +436 154112 +438 195014 +440 197888 +441 136960 +442 163840 +443 112640 +445 108544 +446 230656 +448 108800 +449 125184 +45 100352 +450 163584 +452 155392 +453 103680 +454 130304 +459 158976 +46 183296 +460 185344 +461 117760 +462 108288 +463 112896 +465 129024 +466 126208 +467 76800 +469 143104 +470 99328 +472 122880 +473 101632 +474 125952 +475 145152 +476 189440 +479 110592 +48 102716 +480 141568 +481 149811 +482 110848 +483 123904 +484 132096 +485 112384 +487 124928 +488 95744 +489 156416 +49 84736 +490 97984 +492 102144 +493 108544 +495 136448 +496 118272 +497 128256 +499 120064 +5 148480 +50 103168 +500 158720 +501 98304 +502 83200 +503 74752 +505 97792 +506 143872 +507 128768 +509 147456 +51 110848 +510 136192 +513 118272 +515 130048 +518 161024 +519 126464 +52 144128 +522 123648 +523 75264 +524 146688 +525 135680 +528 182016 +530 135424 +534 90624 +535 123392 +536 87552 +537 124672 +552 166400 +557 220928 +558 126464 +561 152576 +562 170240 +564 99584 +566 116480 +567 155648 +569 154112 +570 160512 +571 131840 +572 134656 +574 103680 +577 155904 +578 153600 +579 121088 +58 78592 +580 127744 +583 101120 +584 84992 +585 95744 +586 125952 +587 127488 +590 104960 +591 159488 +592 163328 +593 93696 +595 111872 +596 136704 +598 304384 +599 235776 +6 184064 +60 156416 +601 145920 +602 115456 +603 163328 +604 124160 +605 162560 +606 182528 +607 109056 +608 96768 +609 130560 +61 83456 +610 152576 +611 196352 +613 120320 +614 166656 +617 216064 +619 133632 +62 68608 +620 109824 +621 99840 +622 168704 +623 113920 +624 139264 +625 166912 +626 209920 +627 139008 +629 263936 +63 150784 +630 91136 +631 120320 +632 123392 +633 121344 +636 141056 +639 148480 +640 166400 +641 117248 +643 189519 +644 131201 +645 107776 +646 128512 +647 120832 +648 108544 +649 278016 +65 85504 +650 128768 +651 170752 +655 132864 +656 122880 +658 110152 +659 111872 +660 123136 +661 123392 +665 112384 +666 128220 +667 150784 +668 153856 +669 99584 +670 102400 +671 144896 +674 107934 +675 88320 +676 201216 +677 138496 +679 290304 +68 100352 +680 126976 +681 160000 +684 210322 +685 214016 +686 147200 +688 137472 +69 155646 +690 203264 +691 105216 +693 146944 +694 154880 +696 105728 +697 221184 +699 102656 +70 93184 +701 231680 +702 124416 +706 136704 +707 189952 +708 242432 +71 129024 +711 87120 +712 94976 +713 125952 +714 102400 +715 157696 +716 165888 +717 97024 +718 145408 +719 218624 +72 77824 +720 129792 +721 153600 +723 199424 +724 122624 +725 102400 +726 185600 +727 144128 +728 163072 +729 104192 +730 130816 +731 142336 +732 117504 +733 168448 +735 151040 +736 105728 +74 126208 +740 183040 +741 162048 +744 190720 +746 148736 +747 119040 +749 137984 +75 178688 +750 131328 +751 129024 +752 191488 +753 135424 +754 141568 +755 146176 +756 113152 +757 180992 +758 147200 +759 120832 +76 168960 +761 155904 +762 92160 +763 199680 +765 89600 +766 225280 +767 125368 +768 195072 +769 167936 +77 158720 +771 152064 +772 115200 +773 120064 +774 162304 +775 110336 +776 188416 +777 189184 +78 114176 +780 107008 +781 101888 +782 164864 +783 88064 +784 98816 +785 141824 +786 167936 +787 119296 +788 137472 +79 175360 +790 93440 +791 87552 +792 108800 +793 71168 +795 145408 +798 110592 +799 145152 +8 216576 +80 130542 +802 107008 +803 144896 +804 123904 +805 97792 +810 97024 +811 139008 +812 140800 +813 79360 +82 206848 +83 160000 +86 199936 +87 167680 +90 195840 +91 133888 +93 151552 +94 193280 +95 133120 +96 152320 +98 129792 +99 190976 diff --git a/exp/tts_stats_raw_phn_none/train/text_shape b/exp/tts_stats_raw_phn_none/train/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..87ec0235a24683cf551cd94867440fe643fd19d9 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/train/text_shape @@ -0,0 +1,1381 @@ +1 93 +100 71 +101 54 +103 96 +104 82 +105 66 +106 98 +108 74 +109 70 +110 74 +111 64 +112 85 +113 64 +114 92 +116 114 +117 128 +118 106 +124 119 +127 89 +13 49 +132 102 +133 142 +135 74 +136 58 +137 60 +139 139 +14 109 +141 59 +142 68 +143 135 +144 65 +145 77 +14502 79 +14507 94 +14520 80 +14522 94 +14523 48 +14532 84 +14538 101 +14540 74 +14551 116 +146 54 +14602 114 +14612 73 +14626 82 +14630 79 +14631 104 +14632 133 +14636 71 +14643 47 +147 48 +14731 121 +14732 76 +14733 64 +14736 57 +14737 95 +14739 102 +14757 134 +14761 54 +14763 105 +14764 103 +14780 89 +14781 98 +14783 88 +14784 84 +14786 101 +14799 102 +148 83 +14802 137 +14813 97 +14818 112 +14820 79 +14822 105 +14840 130 +14841 81 +14845 142 +14856 132 +14858 107 +14865 64 +14876 81 +14878 96 +14879 96 +14880 156 +14881 110 +14882 132 +14883 97 +14890 100 +14892 178 +14896 71 +149 99 +14902 130 +14906 113 +14910 72 +14916 140 +14919 82 +14920 67 +14924 80 +14931 137 +14932 109 +14933 107 +14934 149 +14938 144 +14942 127 +14943 124 +14944 122 +14950 126 +14951 142 +14954 77 +14955 103 +14956 144 +14957 113 +14958 171 +14959 123 +14960 88 +14961 95 +14962 82 +14963 132 +14964 137 +14966 89 +14967 58 +14973 96 +14983 102 +14986 91 +14988 79 +14989 99 +14990 91 +14992 78 +14997 73 +15 67 +15000 93 +15001 55 +15002 50 +15004 75 +15006 58 +15010 86 +15016 114 +15018 146 +15019 74 +15020 72 +15022 89 +15025 87 +15026 114 +15034 84 +15042 66 +15043 128 +15044 139 +15086 129 +151 45 +15195 117 +15202 120 +15207 113 +15212 179 +15221 82 +15225 102 +15268 85 +15284 48 +15285 57 +153 62 +15322 87 +15483 90 +155 54 +15501 58 +15518 60 +15520 92 +156 76 +15637 82 +15638 105 +15640 99 +15658 116 +15687 85 +157 73 +15703 98 +15706 75 +15707 75 +15720 46 +15721 134 +15763 112 +158 75 +15805 108 +15834 59 +15861 74 +15897 47 +159 54 +15921 43 +160 49 +161 72 +16124 98 +16132 91 +16136 55 +16202 51 +16296 46 +16297 86 +163 46 +16324 64 +16345 77 +16346 152 +16347 162 +16348 94 +16349 116 +16350 97 +16351 102 +16368 114 +164 62 +16407 68 +16412 60 +16413 208 +16414 129 +166 64 +167 73 +16765 64 +17 66 +172 62 +173 54 +175 66 +176 76 +177 102 +178 132 +179 103 +18 52 +181 86 +18132 92 +18133 175 +18134 194 +18135 140 +18136 191 +18137 172 +18149 120 +18150 88 +18151 104 +18155 182 +18156 90 +18157 107 +18158 122 +18159 147 +18160 152 +18161 169 +18162 98 +18163 106 +18164 112 +18166 136 +18168 156 +18169 163 +18171 204 +18172 122 +18176 155 +18177 102 +18178 77 +18180 106 +18181 203 +18184 122 +18185 111 +18187 147 +18190 148 +18194 116 +182 141 +18215 45 +18216 103 +18219 101 +18220 116 +18223 76 +18224 83 +18225 101 +18227 94 +18230 118 +18231 67 +18232 158 +18233 95 +18234 69 +18235 66 +18238 48 +18239 66 +18240 98 +18242 74 +18244 64 +18247 43 +18252 95 +18296 65 +183 89 +18307 79 +18308 67 +18326 73 +18337 75 +18356 58 +18375 57 +18377 66 +18378 57 +18380 107 +18387 79 +18392 95 +18394 121 +184 53 +18402 90 +18406 100 +18410 49 +18415 68 +18418 99 +18423 68 +18427 123 +18432 101 +18438 71 +18446 69 +18455 66 +18458 103 +18470 80 +18471 61 +18475 118 +18482 73 +18483 95 +18497 60 +18498 52 +185 120 +18503 101 +18509 81 +18512 104 +18535 91 +18538 62 +18541 34 +18543 62 +18546 89 +18548 66 +18552 104 +18553 68 +18559 82 +18562 40 +18564 79 +18567 94 +18568 65 +18569 93 +18570 58 +18571 103 +18578 72 +18579 65 +18580 45 +18582 108 +18584 67 +18590 92 +186 83 +18601 73 +18606 107 +18610 52 +18624 121 +18639 114 +18641 62 +18642 97 +18646 67 +18655 64 +18658 80 +18670 62 +18680 117 +18681 117 +18686 77 +187 100 +18706 166 +18707 107 +18709 81 +18711 68 +18720 65 +18747 109 +18755 49 +18758 106 +18760 67 +18768 119 +18769 61 +18772 72 +18786 104 +18796 68 +18804 66 +18813 70 +18835 112 +18837 104 +18838 122 +18841 111 +18851 113 +18858 103 +18861 67 +18862 120 +18867 113 +18870 138 +18873 103 +18886 121 +18893 119 +18899 86 +189 92 +18904 90 +18906 82 +18908 59 +18909 76 +18910 100 +18912 87 +18915 72 +18919 107 +18934 103 +18935 66 +18936 77 +18943 94 +18944 90 +18947 93 +18951 66 +18955 116 +18959 120 +18964 81 +18982 54 +18989 85 +18991 114 +18993 100 +18997 45 +19 58 +19001 132 +19005 97 +19010 82 +19011 97 +19015 72 +19024 90 +19028 71 +19063 115 +19065 84 +19067 83 +19075 78 +19076 112 +19090 92 +19091 108 +19095 62 +19096 89 +19099 87 +191 70 +19103 68 +19109 75 +19111 80 +19113 45 +19116 87 +19118 97 +19121 74 +19122 87 +19132 69 +19138 75 +19141 52 +19142 56 +19157 74 +19160 74 +19163 128 +19165 120 +19177 99 +19180 111 +19181 144 +19194 89 +19197 89 +192 62 +19201 155 +19211 101 +19212 131 +19213 99 +19218 108 +19224 90 +19225 59 +19229 64 +19234 96 +19237 90 +19241 95 +19251 97 +19263 45 +19267 73 +19271 54 +19276 76 +19280 83 +19281 104 +19295 87 +19298 53 +19304 86 +19310 67 +19316 68 +19321 109 +19325 80 +19327 93 +19333 107 +19337 117 +19347 118 +19348 113 +19357 106 +19360 89 +19366 85 +19367 92 +19371 64 +19372 84 +19374 75 +19376 103 +19387 108 +19396 57 +19399 54 +194 62 +19400 92 +19404 99 +19406 103 +19410 102 +19413 67 +19414 60 +19423 90 +19429 90 +19439 51 +19440 33 +19449 92 +19451 67 +19454 61 +19477 97 +19482 84 +19488 93 +19496 63 +19499 65 +195 68 +19501 60 +19506 96 +19509 80 +19510 57 +19511 77 +19521 61 +19522 80 +19524 67 +19529 91 +19540 101 +19542 107 +19543 78 +19548 61 +19550 88 +19551 85 +19554 119 +19556 95 +19558 140 +19565 149 +19569 89 +19576 70 +19581 69 +19584 61 +19585 48 +19587 54 +19590 84 +19595 99 +19598 60 +196 72 +19601 117 +19604 90 +19605 92 +19608 79 +19611 96 +19612 121 +19621 90 +19622 117 +19627 113 +19631 87 +19635 111 +19638 77 +19651 80 +19658 100 +19659 71 +19673 80 +19676 101 +19683 66 +19684 111 +19688 88 +19692 90 +19695 59 +197 61 +19700 81 +19702 78 +19703 92 +19705 94 +19713 85 +19714 108 +19723 90 +19726 79 +19748 46 +19749 113 +19753 46 +19760 80 +19763 89 +19764 84 +19775 55 +19778 52 +19782 107 +19783 151 +19785 59 +19787 88 +19789 69 +19791 90 +19797 74 +19798 121 +198 84 +19801 58 +19812 82 +19826 60 +19851 104 +19854 56 +19855 91 +19864 90 +19874 95 +19875 72 +19877 85 +19884 72 +19888 104 +19889 107 +199 85 +19902 66 +19903 116 +19907 116 +19913 74 +19917 63 +19918 106 +19920 108 +19926 122 +19928 71 +19931 84 +19935 117 +19938 58 +19944 71 +19946 52 +19947 71 +19948 80 +19949 64 +19951 133 +19952 102 +19955 64 +19957 81 +19959 106 +19976 97 +19979 66 +19981 66 +19984 80 +19990 127 +19998 100 +200 64 +20001 98 +20005 61 +20020 68 +20022 143 +20029 103 +20038 123 +20042 136 +20051 106 +20055 97 +20062 90 +20080 124 +20087 52 +20095 101 +201 67 +20109 82 +20119 64 +20120 93 +20121 83 +20128 67 +20144 69 +20147 78 +20183 70 +20185 80 +202 95 +20205 63 +20208 118 +20216 63 +20219 79 +20224 63 +20231 93 +20232 109 +20234 94 +20236 128 +20237 103 +20239 137 +20251 83 +20255 65 +20256 60 +20257 119 +20268 85 +20273 91 +20278 57 +20279 107 +20284 115 +20291 84 +20293 67 +20295 104 +20297 87 +20298 102 +20303 72 +20304 55 +20305 61 +20309 77 +20311 57 +20315 79 +20324 98 +20327 84 +20332 98 +20339 52 +20346 70 +20356 96 +20360 80 +20371 100 +20390 63 +20395 44 +20396 44 +20397 69 +20399 103 +204 84 +20402 82 +20406 77 +20408 39 +20410 53 +20413 49 +20422 77 +20427 97 +20435 101 +20438 97 +20440 56 +20442 108 +20445 89 +20447 58 +20461 156 +20464 94 +20465 87 +20482 62 +20484 86 +20488 115 +20489 86 +20495 71 +20496 84 +205 71 +20503 97 +20504 97 +20512 86 +20513 75 +20516 48 +20517 56 +20520 113 +20523 67 +20524 72 +20532 124 +20535 57 +20547 73 +20549 75 +20554 48 +20555 72 +20558 114 +20561 140 +20575 44 +20576 69 +20582 96 +20584 88 +20587 79 +20589 83 +20597 87 +206 90 +20604 103 +20605 88 +20606 86 +20607 77 +20616 68 +20617 89 +20621 96 +20622 89 +20629 99 +20633 49 +20635 66 +20643 90 +20644 92 +20645 72 +20663 130 +20665 45 +20668 93 +20673 69 +20674 72 +20675 73 +20679 86 +20698 74 +20699 84 +207 98 +20702 120 +20708 101 +20709 78 +20710 93 +20714 97 +20729 80 +20739 56 +20740 98 +20751 48 +20754 116 +20755 80 +208 112 +209 74 +20913 76 +20914 130 +20923 74 +20926 106 +20928 77 +20935 60 +20954 145 +20955 104 +20959 85 +20961 83 +20982 100 +20988 67 +20992 87 +21002 120 +21005 148 +21017 105 +21020 68 +21022 123 +21028 65 +21030 88 +21033 79 +21038 54 +21043 100 +21045 48 +21055 111 +21061 96 +21072 81 +21075 86 +21082 72 +21084 73 +21085 76 +21088 103 +21097 96 +21098 82 +211 70 +21100 126 +21101 145 +21103 78 +21107 74 +21109 85 +21113 72 +21115 63 +21129 98 +21139 100 +21143 85 +21144 93 +21147 74 +21148 65 +21158 76 +21165 110 +21170 83 +21175 92 +21176 62 +21178 66 +21192 93 +21193 63 +21199 65 +212 68 +21201 89 +21210 87 +21214 89 +21228 83 +21235 118 +21239 121 +21248 90 +21252 115 +21266 95 +21269 103 +21270 88 +21271 90 +21282 129 +21287 63 +21290 50 +213 82 +21307 82 +21308 101 +21309 97 +21310 105 +21312 93 +21348 74 +21349 56 +21362 114 +21363 124 +21366 85 +21368 40 +21372 64 +21375 64 +21379 76 +21382 74 +21385 82 +21388 74 +21397 62 +214 52 +21401 121 +21402 89 +21403 68 +21411 70 +21412 97 +21414 97 +21420 84 +21422 69 +21424 106 +21425 77 +21433 104 +21447 86 +21449 91 +21475 97 +21479 101 +21481 39 +21492 73 +21498 79 +21500 82 +21502 90 +21503 100 +21505 83 +21511 58 +21515 127 +21517 82 +21521 74 +21525 107 +21526 61 +21536 54 +21539 96 +21540 45 +21544 75 +21546 46 +21547 58 +21560 65 +21567 122 +21569 68 +21571 55 +21575 77 +21584 60 +21590 66 +21591 109 +21592 67 +21596 60 +21597 39 +21598 49 +21604 74 +21605 84 +21609 49 +21623 78 +21624 100 +21675 91 +21726 61 +21740 67 +21744 65 +218 84 +219 51 +22 49 +220 72 +222 85 +223 66 +227 68 +229 57 +231 72 +233 94 +234 82 +235 70 +236 66 +238 122 +240 87 +241 70 +242 58 +243 56 +244 64 +246 82 +249 81 +250 62 +251 68 +253 97 +254 50 +257 69 +258 65 +259 86 +260 70 +261 99 +264 64 +265 82 +266 105 +268 73 +269 65 +271 108 +272 81 +274 81 +275 65 +277 96 +278 72 +281 100 +282 92 +283 57 +284 92 +288 83 +289 109 +290 88 +293 61 +294 60 +295 70 +297 60 +299 70 +3 49 +300 66 +301 122 +303 65 +304 71 +305 54 +307 80 +308 48 +309 54 +310 72 +311 54 +312 50 +313 66 +314 58 +315 34 +317 52 +318 101 +32 88 +320 72 +322 68 +323 51 +325 41 +328 85 +33 45 +331 88 +333 52 +336 54 +337 54 +338 69 +339 78 +34 62 +340 76 +341 88 +342 92 +345 82 +346 94 +348 70 +35 52 +351 120 +354 60 +356 66 +357 71 +358 58 +36 59 +360 76 +361 54 +362 84 +363 48 +364 63 +365 82 +366 97 +367 65 +37 50 +370 76 +372 72 +373 126 +374 110 +375 66 +376 66 +377 69 +378 108 +379 77 +38 73 +381 86 +382 82 +384 69 +386 107 +388 67 +389 67 +392 127 +393 78 +394 57 +396 45 +398 71 +399 89 +40 62 +400 82 +401 65 +402 94 +404 51 +405 54 +406 64 +407 72 +41 53 +411 74 +413 72 +414 48 +415 60 +416 79 +42 64 +421 64 +422 60 +424 86 +425 97 +426 73 +427 51 +429 84 +43 61 +430 68 +431 89 +432 89 +433 58 +434 78 +436 82 +438 103 +440 98 +441 72 +442 77 +443 55 +445 52 +446 104 +448 54 +449 69 +45 43 +450 82 +452 84 +453 50 +454 71 +459 82 +46 87 +460 100 +461 53 +462 46 +463 79 +465 79 +466 70 +467 43 +469 68 +470 60 +472 68 +473 50 +474 74 +475 81 +476 96 +479 54 +48 53 +480 85 +481 72 +482 57 +483 50 +484 76 +485 64 +487 69 +488 50 +489 81 +49 36 +490 52 +492 60 +493 66 +495 58 +496 66 +497 56 +499 68 +5 65 +50 47 +500 85 +501 61 +502 46 +503 35 +505 54 +506 86 +507 71 +509 82 +51 50 +510 67 +513 57 +515 61 +518 80 +519 60 +52 70 +522 71 +523 46 +524 92 +525 66 +528 93 +530 66 +534 47 +535 57 +536 55 +537 65 +552 91 +557 103 +558 67 +561 78 +562 98 +564 50 +566 56 +567 82 +569 68 +570 77 +571 64 +572 67 +574 53 +577 73 +578 75 +579 64 +58 47 +580 72 +583 65 +584 52 +585 55 +586 65 +587 72 +590 44 +591 84 +592 78 +593 49 +595 54 +596 77 +598 139 +599 132 +6 98 +60 89 +601 74 +602 56 +603 97 +604 65 +605 88 +606 101 +607 61 +608 53 +609 67 +61 36 +610 100 +611 125 +613 64 +614 95 +617 108 +619 59 +62 38 +620 55 +621 53 +622 87 +623 50 +624 75 +625 86 +626 102 +627 64 +629 156 +63 81 +630 53 +631 62 +632 64 +633 62 +636 74 +639 82 +640 93 +641 60 +643 117 +644 64 +645 50 +646 66 +647 60 +648 52 +649 138 +65 38 +650 58 +651 91 +655 86 +656 71 +658 56 +659 62 +660 63 +661 67 +665 48 +666 59 +667 79 +668 93 +669 46 +670 56 +671 76 +674 50 +675 42 +676 103 +677 77 +679 179 +68 52 +680 75 +681 90 +684 118 +685 100 +686 86 +688 70 +69 83 +690 97 +691 52 +693 82 +694 83 +696 61 +697 112 +699 57 +70 55 +701 131 +702 70 +706 71 +707 106 +708 144 +71 74 +711 48 +712 50 +713 64 +714 48 +715 85 +716 93 +717 48 +718 69 +719 99 +72 37 +720 78 +721 82 +723 101 +724 59 +725 50 +726 95 +727 76 +728 87 +729 46 +730 72 +731 72 +732 60 +733 88 +735 84 +736 53 +74 54 +740 92 +741 77 +744 100 +746 70 +747 66 +749 72 +75 94 +750 62 +751 67 +752 100 +753 72 +754 67 +755 60 +756 49 +757 110 +758 76 +759 66 +76 80 +761 78 +762 54 +763 133 +765 46 +766 130 +767 63 +768 105 +769 97 +77 65 +771 85 +772 55 +773 65 +774 90 +775 60 +776 109 +777 102 +78 55 +780 56 +781 62 +782 74 +783 41 +784 48 +785 79 +786 92 +787 63 +788 70 +79 94 +790 51 +791 48 +792 59 +793 38 +795 73 +798 57 +799 87 +8 123 +80 70 +802 55 +803 66 +804 73 +805 50 +810 51 +811 79 +812 69 +813 42 +82 121 +83 59 +86 99 +87 77 +90 100 +91 66 +93 76 +94 102 +95 63 +96 78 +98 68 +99 96 diff --git a/exp/tts_stats_raw_phn_none/train/text_shape.phn b/exp/tts_stats_raw_phn_none/train/text_shape.phn new file mode 100644 index 0000000000000000000000000000000000000000..41971537d821f50003b6ef91596d1e9c590037a6 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/train/text_shape.phn @@ -0,0 +1,1381 @@ +1 93,79 +100 71,79 +101 54,79 +103 96,79 +104 82,79 +105 66,79 +106 98,79 +108 74,79 +109 70,79 +110 74,79 +111 64,79 +112 85,79 +113 64,79 +114 92,79 +116 114,79 +117 128,79 +118 106,79 +124 119,79 +127 89,79 +13 49,79 +132 102,79 +133 142,79 +135 74,79 +136 58,79 +137 60,79 +139 139,79 +14 109,79 +141 59,79 +142 68,79 +143 135,79 +144 65,79 +145 77,79 +14502 79,79 +14507 94,79 +14520 80,79 +14522 94,79 +14523 48,79 +14532 84,79 +14538 101,79 +14540 74,79 +14551 116,79 +146 54,79 +14602 114,79 +14612 73,79 +14626 82,79 +14630 79,79 +14631 104,79 +14632 133,79 +14636 71,79 +14643 47,79 +147 48,79 +14731 121,79 +14732 76,79 +14733 64,79 +14736 57,79 +14737 95,79 +14739 102,79 +14757 134,79 +14761 54,79 +14763 105,79 +14764 103,79 +14780 89,79 +14781 98,79 +14783 88,79 +14784 84,79 +14786 101,79 +14799 102,79 +148 83,79 +14802 137,79 +14813 97,79 +14818 112,79 +14820 79,79 +14822 105,79 +14840 130,79 +14841 81,79 +14845 142,79 +14856 132,79 +14858 107,79 +14865 64,79 +14876 81,79 +14878 96,79 +14879 96,79 +14880 156,79 +14881 110,79 +14882 132,79 +14883 97,79 +14890 100,79 +14892 178,79 +14896 71,79 +149 99,79 +14902 130,79 +14906 113,79 +14910 72,79 +14916 140,79 +14919 82,79 +14920 67,79 +14924 80,79 +14931 137,79 +14932 109,79 +14933 107,79 +14934 149,79 +14938 144,79 +14942 127,79 +14943 124,79 +14944 122,79 +14950 126,79 +14951 142,79 +14954 77,79 +14955 103,79 +14956 144,79 +14957 113,79 +14958 171,79 +14959 123,79 +14960 88,79 +14961 95,79 +14962 82,79 +14963 132,79 +14964 137,79 +14966 89,79 +14967 58,79 +14973 96,79 +14983 102,79 +14986 91,79 +14988 79,79 +14989 99,79 +14990 91,79 +14992 78,79 +14997 73,79 +15 67,79 +15000 93,79 +15001 55,79 +15002 50,79 +15004 75,79 +15006 58,79 +15010 86,79 +15016 114,79 +15018 146,79 +15019 74,79 +15020 72,79 +15022 89,79 +15025 87,79 +15026 114,79 +15034 84,79 +15042 66,79 +15043 128,79 +15044 139,79 +15086 129,79 +151 45,79 +15195 117,79 +15202 120,79 +15207 113,79 +15212 179,79 +15221 82,79 +15225 102,79 +15268 85,79 +15284 48,79 +15285 57,79 +153 62,79 +15322 87,79 +15483 90,79 +155 54,79 +15501 58,79 +15518 60,79 +15520 92,79 +156 76,79 +15637 82,79 +15638 105,79 +15640 99,79 +15658 116,79 +15687 85,79 +157 73,79 +15703 98,79 +15706 75,79 +15707 75,79 +15720 46,79 +15721 134,79 +15763 112,79 +158 75,79 +15805 108,79 +15834 59,79 +15861 74,79 +15897 47,79 +159 54,79 +15921 43,79 +160 49,79 +161 72,79 +16124 98,79 +16132 91,79 +16136 55,79 +16202 51,79 +16296 46,79 +16297 86,79 +163 46,79 +16324 64,79 +16345 77,79 +16346 152,79 +16347 162,79 +16348 94,79 +16349 116,79 +16350 97,79 +16351 102,79 +16368 114,79 +164 62,79 +16407 68,79 +16412 60,79 +16413 208,79 +16414 129,79 +166 64,79 +167 73,79 +16765 64,79 +17 66,79 +172 62,79 +173 54,79 +175 66,79 +176 76,79 +177 102,79 +178 132,79 +179 103,79 +18 52,79 +181 86,79 +18132 92,79 +18133 175,79 +18134 194,79 +18135 140,79 +18136 191,79 +18137 172,79 +18149 120,79 +18150 88,79 +18151 104,79 +18155 182,79 +18156 90,79 +18157 107,79 +18158 122,79 +18159 147,79 +18160 152,79 +18161 169,79 +18162 98,79 +18163 106,79 +18164 112,79 +18166 136,79 +18168 156,79 +18169 163,79 +18171 204,79 +18172 122,79 +18176 155,79 +18177 102,79 +18178 77,79 +18180 106,79 +18181 203,79 +18184 122,79 +18185 111,79 +18187 147,79 +18190 148,79 +18194 116,79 +182 141,79 +18215 45,79 +18216 103,79 +18219 101,79 +18220 116,79 +18223 76,79 +18224 83,79 +18225 101,79 +18227 94,79 +18230 118,79 +18231 67,79 +18232 158,79 +18233 95,79 +18234 69,79 +18235 66,79 +18238 48,79 +18239 66,79 +18240 98,79 +18242 74,79 +18244 64,79 +18247 43,79 +18252 95,79 +18296 65,79 +183 89,79 +18307 79,79 +18308 67,79 +18326 73,79 +18337 75,79 +18356 58,79 +18375 57,79 +18377 66,79 +18378 57,79 +18380 107,79 +18387 79,79 +18392 95,79 +18394 121,79 +184 53,79 +18402 90,79 +18406 100,79 +18410 49,79 +18415 68,79 +18418 99,79 +18423 68,79 +18427 123,79 +18432 101,79 +18438 71,79 +18446 69,79 +18455 66,79 +18458 103,79 +18470 80,79 +18471 61,79 +18475 118,79 +18482 73,79 +18483 95,79 +18497 60,79 +18498 52,79 +185 120,79 +18503 101,79 +18509 81,79 +18512 104,79 +18535 91,79 +18538 62,79 +18541 34,79 +18543 62,79 +18546 89,79 +18548 66,79 +18552 104,79 +18553 68,79 +18559 82,79 +18562 40,79 +18564 79,79 +18567 94,79 +18568 65,79 +18569 93,79 +18570 58,79 +18571 103,79 +18578 72,79 +18579 65,79 +18580 45,79 +18582 108,79 +18584 67,79 +18590 92,79 +186 83,79 +18601 73,79 +18606 107,79 +18610 52,79 +18624 121,79 +18639 114,79 +18641 62,79 +18642 97,79 +18646 67,79 +18655 64,79 +18658 80,79 +18670 62,79 +18680 117,79 +18681 117,79 +18686 77,79 +187 100,79 +18706 166,79 +18707 107,79 +18709 81,79 +18711 68,79 +18720 65,79 +18747 109,79 +18755 49,79 +18758 106,79 +18760 67,79 +18768 119,79 +18769 61,79 +18772 72,79 +18786 104,79 +18796 68,79 +18804 66,79 +18813 70,79 +18835 112,79 +18837 104,79 +18838 122,79 +18841 111,79 +18851 113,79 +18858 103,79 +18861 67,79 +18862 120,79 +18867 113,79 +18870 138,79 +18873 103,79 +18886 121,79 +18893 119,79 +18899 86,79 +189 92,79 +18904 90,79 +18906 82,79 +18908 59,79 +18909 76,79 +18910 100,79 +18912 87,79 +18915 72,79 +18919 107,79 +18934 103,79 +18935 66,79 +18936 77,79 +18943 94,79 +18944 90,79 +18947 93,79 +18951 66,79 +18955 116,79 +18959 120,79 +18964 81,79 +18982 54,79 +18989 85,79 +18991 114,79 +18993 100,79 +18997 45,79 +19 58,79 +19001 132,79 +19005 97,79 +19010 82,79 +19011 97,79 +19015 72,79 +19024 90,79 +19028 71,79 +19063 115,79 +19065 84,79 +19067 83,79 +19075 78,79 +19076 112,79 +19090 92,79 +19091 108,79 +19095 62,79 +19096 89,79 +19099 87,79 +191 70,79 +19103 68,79 +19109 75,79 +19111 80,79 +19113 45,79 +19116 87,79 +19118 97,79 +19121 74,79 +19122 87,79 +19132 69,79 +19138 75,79 +19141 52,79 +19142 56,79 +19157 74,79 +19160 74,79 +19163 128,79 +19165 120,79 +19177 99,79 +19180 111,79 +19181 144,79 +19194 89,79 +19197 89,79 +192 62,79 +19201 155,79 +19211 101,79 +19212 131,79 +19213 99,79 +19218 108,79 +19224 90,79 +19225 59,79 +19229 64,79 +19234 96,79 +19237 90,79 +19241 95,79 +19251 97,79 +19263 45,79 +19267 73,79 +19271 54,79 +19276 76,79 +19280 83,79 +19281 104,79 +19295 87,79 +19298 53,79 +19304 86,79 +19310 67,79 +19316 68,79 +19321 109,79 +19325 80,79 +19327 93,79 +19333 107,79 +19337 117,79 +19347 118,79 +19348 113,79 +19357 106,79 +19360 89,79 +19366 85,79 +19367 92,79 +19371 64,79 +19372 84,79 +19374 75,79 +19376 103,79 +19387 108,79 +19396 57,79 +19399 54,79 +194 62,79 +19400 92,79 +19404 99,79 +19406 103,79 +19410 102,79 +19413 67,79 +19414 60,79 +19423 90,79 +19429 90,79 +19439 51,79 +19440 33,79 +19449 92,79 +19451 67,79 +19454 61,79 +19477 97,79 +19482 84,79 +19488 93,79 +19496 63,79 +19499 65,79 +195 68,79 +19501 60,79 +19506 96,79 +19509 80,79 +19510 57,79 +19511 77,79 +19521 61,79 +19522 80,79 +19524 67,79 +19529 91,79 +19540 101,79 +19542 107,79 +19543 78,79 +19548 61,79 +19550 88,79 +19551 85,79 +19554 119,79 +19556 95,79 +19558 140,79 +19565 149,79 +19569 89,79 +19576 70,79 +19581 69,79 +19584 61,79 +19585 48,79 +19587 54,79 +19590 84,79 +19595 99,79 +19598 60,79 +196 72,79 +19601 117,79 +19604 90,79 +19605 92,79 +19608 79,79 +19611 96,79 +19612 121,79 +19621 90,79 +19622 117,79 +19627 113,79 +19631 87,79 +19635 111,79 +19638 77,79 +19651 80,79 +19658 100,79 +19659 71,79 +19673 80,79 +19676 101,79 +19683 66,79 +19684 111,79 +19688 88,79 +19692 90,79 +19695 59,79 +197 61,79 +19700 81,79 +19702 78,79 +19703 92,79 +19705 94,79 +19713 85,79 +19714 108,79 +19723 90,79 +19726 79,79 +19748 46,79 +19749 113,79 +19753 46,79 +19760 80,79 +19763 89,79 +19764 84,79 +19775 55,79 +19778 52,79 +19782 107,79 +19783 151,79 +19785 59,79 +19787 88,79 +19789 69,79 +19791 90,79 +19797 74,79 +19798 121,79 +198 84,79 +19801 58,79 +19812 82,79 +19826 60,79 +19851 104,79 +19854 56,79 +19855 91,79 +19864 90,79 +19874 95,79 +19875 72,79 +19877 85,79 +19884 72,79 +19888 104,79 +19889 107,79 +199 85,79 +19902 66,79 +19903 116,79 +19907 116,79 +19913 74,79 +19917 63,79 +19918 106,79 +19920 108,79 +19926 122,79 +19928 71,79 +19931 84,79 +19935 117,79 +19938 58,79 +19944 71,79 +19946 52,79 +19947 71,79 +19948 80,79 +19949 64,79 +19951 133,79 +19952 102,79 +19955 64,79 +19957 81,79 +19959 106,79 +19976 97,79 +19979 66,79 +19981 66,79 +19984 80,79 +19990 127,79 +19998 100,79 +200 64,79 +20001 98,79 +20005 61,79 +20020 68,79 +20022 143,79 +20029 103,79 +20038 123,79 +20042 136,79 +20051 106,79 +20055 97,79 +20062 90,79 +20080 124,79 +20087 52,79 +20095 101,79 +201 67,79 +20109 82,79 +20119 64,79 +20120 93,79 +20121 83,79 +20128 67,79 +20144 69,79 +20147 78,79 +20183 70,79 +20185 80,79 +202 95,79 +20205 63,79 +20208 118,79 +20216 63,79 +20219 79,79 +20224 63,79 +20231 93,79 +20232 109,79 +20234 94,79 +20236 128,79 +20237 103,79 +20239 137,79 +20251 83,79 +20255 65,79 +20256 60,79 +20257 119,79 +20268 85,79 +20273 91,79 +20278 57,79 +20279 107,79 +20284 115,79 +20291 84,79 +20293 67,79 +20295 104,79 +20297 87,79 +20298 102,79 +20303 72,79 +20304 55,79 +20305 61,79 +20309 77,79 +20311 57,79 +20315 79,79 +20324 98,79 +20327 84,79 +20332 98,79 +20339 52,79 +20346 70,79 +20356 96,79 +20360 80,79 +20371 100,79 +20390 63,79 +20395 44,79 +20396 44,79 +20397 69,79 +20399 103,79 +204 84,79 +20402 82,79 +20406 77,79 +20408 39,79 +20410 53,79 +20413 49,79 +20422 77,79 +20427 97,79 +20435 101,79 +20438 97,79 +20440 56,79 +20442 108,79 +20445 89,79 +20447 58,79 +20461 156,79 +20464 94,79 +20465 87,79 +20482 62,79 +20484 86,79 +20488 115,79 +20489 86,79 +20495 71,79 +20496 84,79 +205 71,79 +20503 97,79 +20504 97,79 +20512 86,79 +20513 75,79 +20516 48,79 +20517 56,79 +20520 113,79 +20523 67,79 +20524 72,79 +20532 124,79 +20535 57,79 +20547 73,79 +20549 75,79 +20554 48,79 +20555 72,79 +20558 114,79 +20561 140,79 +20575 44,79 +20576 69,79 +20582 96,79 +20584 88,79 +20587 79,79 +20589 83,79 +20597 87,79 +206 90,79 +20604 103,79 +20605 88,79 +20606 86,79 +20607 77,79 +20616 68,79 +20617 89,79 +20621 96,79 +20622 89,79 +20629 99,79 +20633 49,79 +20635 66,79 +20643 90,79 +20644 92,79 +20645 72,79 +20663 130,79 +20665 45,79 +20668 93,79 +20673 69,79 +20674 72,79 +20675 73,79 +20679 86,79 +20698 74,79 +20699 84,79 +207 98,79 +20702 120,79 +20708 101,79 +20709 78,79 +20710 93,79 +20714 97,79 +20729 80,79 +20739 56,79 +20740 98,79 +20751 48,79 +20754 116,79 +20755 80,79 +208 112,79 +209 74,79 +20913 76,79 +20914 130,79 +20923 74,79 +20926 106,79 +20928 77,79 +20935 60,79 +20954 145,79 +20955 104,79 +20959 85,79 +20961 83,79 +20982 100,79 +20988 67,79 +20992 87,79 +21002 120,79 +21005 148,79 +21017 105,79 +21020 68,79 +21022 123,79 +21028 65,79 +21030 88,79 +21033 79,79 +21038 54,79 +21043 100,79 +21045 48,79 +21055 111,79 +21061 96,79 +21072 81,79 +21075 86,79 +21082 72,79 +21084 73,79 +21085 76,79 +21088 103,79 +21097 96,79 +21098 82,79 +211 70,79 +21100 126,79 +21101 145,79 +21103 78,79 +21107 74,79 +21109 85,79 +21113 72,79 +21115 63,79 +21129 98,79 +21139 100,79 +21143 85,79 +21144 93,79 +21147 74,79 +21148 65,79 +21158 76,79 +21165 110,79 +21170 83,79 +21175 92,79 +21176 62,79 +21178 66,79 +21192 93,79 +21193 63,79 +21199 65,79 +212 68,79 +21201 89,79 +21210 87,79 +21214 89,79 +21228 83,79 +21235 118,79 +21239 121,79 +21248 90,79 +21252 115,79 +21266 95,79 +21269 103,79 +21270 88,79 +21271 90,79 +21282 129,79 +21287 63,79 +21290 50,79 +213 82,79 +21307 82,79 +21308 101,79 +21309 97,79 +21310 105,79 +21312 93,79 +21348 74,79 +21349 56,79 +21362 114,79 +21363 124,79 +21366 85,79 +21368 40,79 +21372 64,79 +21375 64,79 +21379 76,79 +21382 74,79 +21385 82,79 +21388 74,79 +21397 62,79 +214 52,79 +21401 121,79 +21402 89,79 +21403 68,79 +21411 70,79 +21412 97,79 +21414 97,79 +21420 84,79 +21422 69,79 +21424 106,79 +21425 77,79 +21433 104,79 +21447 86,79 +21449 91,79 +21475 97,79 +21479 101,79 +21481 39,79 +21492 73,79 +21498 79,79 +21500 82,79 +21502 90,79 +21503 100,79 +21505 83,79 +21511 58,79 +21515 127,79 +21517 82,79 +21521 74,79 +21525 107,79 +21526 61,79 +21536 54,79 +21539 96,79 +21540 45,79 +21544 75,79 +21546 46,79 +21547 58,79 +21560 65,79 +21567 122,79 +21569 68,79 +21571 55,79 +21575 77,79 +21584 60,79 +21590 66,79 +21591 109,79 +21592 67,79 +21596 60,79 +21597 39,79 +21598 49,79 +21604 74,79 +21605 84,79 +21609 49,79 +21623 78,79 +21624 100,79 +21675 91,79 +21726 61,79 +21740 67,79 +21744 65,79 +218 84,79 +219 51,79 +22 49,79 +220 72,79 +222 85,79 +223 66,79 +227 68,79 +229 57,79 +231 72,79 +233 94,79 +234 82,79 +235 70,79 +236 66,79 +238 122,79 +240 87,79 +241 70,79 +242 58,79 +243 56,79 +244 64,79 +246 82,79 +249 81,79 +250 62,79 +251 68,79 +253 97,79 +254 50,79 +257 69,79 +258 65,79 +259 86,79 +260 70,79 +261 99,79 +264 64,79 +265 82,79 +266 105,79 +268 73,79 +269 65,79 +271 108,79 +272 81,79 +274 81,79 +275 65,79 +277 96,79 +278 72,79 +281 100,79 +282 92,79 +283 57,79 +284 92,79 +288 83,79 +289 109,79 +290 88,79 +293 61,79 +294 60,79 +295 70,79 +297 60,79 +299 70,79 +3 49,79 +300 66,79 +301 122,79 +303 65,79 +304 71,79 +305 54,79 +307 80,79 +308 48,79 +309 54,79 +310 72,79 +311 54,79 +312 50,79 +313 66,79 +314 58,79 +315 34,79 +317 52,79 +318 101,79 +32 88,79 +320 72,79 +322 68,79 +323 51,79 +325 41,79 +328 85,79 +33 45,79 +331 88,79 +333 52,79 +336 54,79 +337 54,79 +338 69,79 +339 78,79 +34 62,79 +340 76,79 +341 88,79 +342 92,79 +345 82,79 +346 94,79 +348 70,79 +35 52,79 +351 120,79 +354 60,79 +356 66,79 +357 71,79 +358 58,79 +36 59,79 +360 76,79 +361 54,79 +362 84,79 +363 48,79 +364 63,79 +365 82,79 +366 97,79 +367 65,79 +37 50,79 +370 76,79 +372 72,79 +373 126,79 +374 110,79 +375 66,79 +376 66,79 +377 69,79 +378 108,79 +379 77,79 +38 73,79 +381 86,79 +382 82,79 +384 69,79 +386 107,79 +388 67,79 +389 67,79 +392 127,79 +393 78,79 +394 57,79 +396 45,79 +398 71,79 +399 89,79 +40 62,79 +400 82,79 +401 65,79 +402 94,79 +404 51,79 +405 54,79 +406 64,79 +407 72,79 +41 53,79 +411 74,79 +413 72,79 +414 48,79 +415 60,79 +416 79,79 +42 64,79 +421 64,79 +422 60,79 +424 86,79 +425 97,79 +426 73,79 +427 51,79 +429 84,79 +43 61,79 +430 68,79 +431 89,79 +432 89,79 +433 58,79 +434 78,79 +436 82,79 +438 103,79 +440 98,79 +441 72,79 +442 77,79 +443 55,79 +445 52,79 +446 104,79 +448 54,79 +449 69,79 +45 43,79 +450 82,79 +452 84,79 +453 50,79 +454 71,79 +459 82,79 +46 87,79 +460 100,79 +461 53,79 +462 46,79 +463 79,79 +465 79,79 +466 70,79 +467 43,79 +469 68,79 +470 60,79 +472 68,79 +473 50,79 +474 74,79 +475 81,79 +476 96,79 +479 54,79 +48 53,79 +480 85,79 +481 72,79 +482 57,79 +483 50,79 +484 76,79 +485 64,79 +487 69,79 +488 50,79 +489 81,79 +49 36,79 +490 52,79 +492 60,79 +493 66,79 +495 58,79 +496 66,79 +497 56,79 +499 68,79 +5 65,79 +50 47,79 +500 85,79 +501 61,79 +502 46,79 +503 35,79 +505 54,79 +506 86,79 +507 71,79 +509 82,79 +51 50,79 +510 67,79 +513 57,79 +515 61,79 +518 80,79 +519 60,79 +52 70,79 +522 71,79 +523 46,79 +524 92,79 +525 66,79 +528 93,79 +530 66,79 +534 47,79 +535 57,79 +536 55,79 +537 65,79 +552 91,79 +557 103,79 +558 67,79 +561 78,79 +562 98,79 +564 50,79 +566 56,79 +567 82,79 +569 68,79 +570 77,79 +571 64,79 +572 67,79 +574 53,79 +577 73,79 +578 75,79 +579 64,79 +58 47,79 +580 72,79 +583 65,79 +584 52,79 +585 55,79 +586 65,79 +587 72,79 +590 44,79 +591 84,79 +592 78,79 +593 49,79 +595 54,79 +596 77,79 +598 139,79 +599 132,79 +6 98,79 +60 89,79 +601 74,79 +602 56,79 +603 97,79 +604 65,79 +605 88,79 +606 101,79 +607 61,79 +608 53,79 +609 67,79 +61 36,79 +610 100,79 +611 125,79 +613 64,79 +614 95,79 +617 108,79 +619 59,79 +62 38,79 +620 55,79 +621 53,79 +622 87,79 +623 50,79 +624 75,79 +625 86,79 +626 102,79 +627 64,79 +629 156,79 +63 81,79 +630 53,79 +631 62,79 +632 64,79 +633 62,79 +636 74,79 +639 82,79 +640 93,79 +641 60,79 +643 117,79 +644 64,79 +645 50,79 +646 66,79 +647 60,79 +648 52,79 +649 138,79 +65 38,79 +650 58,79 +651 91,79 +655 86,79 +656 71,79 +658 56,79 +659 62,79 +660 63,79 +661 67,79 +665 48,79 +666 59,79 +667 79,79 +668 93,79 +669 46,79 +670 56,79 +671 76,79 +674 50,79 +675 42,79 +676 103,79 +677 77,79 +679 179,79 +68 52,79 +680 75,79 +681 90,79 +684 118,79 +685 100,79 +686 86,79 +688 70,79 +69 83,79 +690 97,79 +691 52,79 +693 82,79 +694 83,79 +696 61,79 +697 112,79 +699 57,79 +70 55,79 +701 131,79 +702 70,79 +706 71,79 +707 106,79 +708 144,79 +71 74,79 +711 48,79 +712 50,79 +713 64,79 +714 48,79 +715 85,79 +716 93,79 +717 48,79 +718 69,79 +719 99,79 +72 37,79 +720 78,79 +721 82,79 +723 101,79 +724 59,79 +725 50,79 +726 95,79 +727 76,79 +728 87,79 +729 46,79 +730 72,79 +731 72,79 +732 60,79 +733 88,79 +735 84,79 +736 53,79 +74 54,79 +740 92,79 +741 77,79 +744 100,79 +746 70,79 +747 66,79 +749 72,79 +75 94,79 +750 62,79 +751 67,79 +752 100,79 +753 72,79 +754 67,79 +755 60,79 +756 49,79 +757 110,79 +758 76,79 +759 66,79 +76 80,79 +761 78,79 +762 54,79 +763 133,79 +765 46,79 +766 130,79 +767 63,79 +768 105,79 +769 97,79 +77 65,79 +771 85,79 +772 55,79 +773 65,79 +774 90,79 +775 60,79 +776 109,79 +777 102,79 +78 55,79 +780 56,79 +781 62,79 +782 74,79 +783 41,79 +784 48,79 +785 79,79 +786 92,79 +787 63,79 +788 70,79 +79 94,79 +790 51,79 +791 48,79 +792 59,79 +793 38,79 +795 73,79 +798 57,79 +799 87,79 +8 123,79 +80 70,79 +802 55,79 +803 66,79 +804 73,79 +805 50,79 +810 51,79 +811 79,79 +812 69,79 +813 42,79 +82 121,79 +83 59,79 +86 99,79 +87 77,79 +90 100,79 +91 66,79 +93 76,79 +94 102,79 +95 63,79 +96 78,79 +98 68,79 +99 96,79 diff --git a/exp/tts_stats_raw_phn_none/valid/feats_lengths_stats.npz b/exp/tts_stats_raw_phn_none/valid/feats_lengths_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..640ba6e945b68b2aa84a46542f615b98a93bc3a7 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/valid/feats_lengths_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db8d43046b976ac43e31f56050319ae1326d5cc8a8813a81e60685f6bf02b6ef +size 778 diff --git a/exp/tts_stats_raw_phn_none/valid/feats_stats.npz b/exp/tts_stats_raw_phn_none/valid/feats_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..35b96ab42a2d83234ab44b0bf6fd71e4a7e5fe46 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/valid/feats_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb2dfae83917834fd68fbec8c0f0e7504c4c524bf19c7b1298f9d4d1c958e24 +size 1402 diff --git a/exp/tts_stats_raw_phn_none/valid/speech_shape b/exp/tts_stats_raw_phn_none/valid/speech_shape new file mode 100644 index 0000000000000000000000000000000000000000..3787b0ff8fb9529533964fbf5203a51a31f0064e --- /dev/null +++ b/exp/tts_stats_raw_phn_none/valid/speech_shape @@ -0,0 +1,65 @@ +115 113152 +119 289536 +120 95488 +123 181760 +125 254720 +126 228864 +128 247808 +129 144896 +130 189184 +131 209920 +14616 165120 +14849 156672 +14891 265472 +14941 166144 +14991 157696 +15003 125184 +15079 148992 +15269 138496 +15665 165376 +169 189952 +18237 234496 +18774 153856 +18913 118784 +18963 129280 +19178 177408 +19312 155904 +19335 203264 +19517 130304 +19536 195328 +19769 152064 +19771 194816 +19975 173056 +20258 157440 +20265 127232 +20613 128000 +20642 164352 +20701 131072 +21 159232 +21440 155648 +21499 114944 +21601 180224 +280 184832 +286 92672 +287 146944 +296 109824 +409 143616 +458 153856 +531 134656 +538 86784 +539 175104 +540 228608 +545 121600 +547 169984 +549 102144 +551 164608 +554 107776 +559 238336 +560 162816 +588 187904 +672 129280 +673 244224 +678 148224 +698 236544 +739 167168 +808 133376 diff --git a/exp/tts_stats_raw_phn_none/valid/text_shape b/exp/tts_stats_raw_phn_none/valid/text_shape new file mode 100644 index 0000000000000000000000000000000000000000..de1237ca8e7f24750f4fc7b33250f73b95c557cc --- /dev/null +++ b/exp/tts_stats_raw_phn_none/valid/text_shape @@ -0,0 +1,65 @@ +115 51 +119 156 +120 47 +123 96 +125 117 +126 123 +128 129 +129 68 +130 88 +131 105 +14616 89 +14849 74 +14891 125 +14941 88 +14991 96 +15003 69 +15079 86 +15269 59 +15665 96 +169 104 +18237 134 +18774 47 +18913 60 +18963 58 +19178 91 +19312 84 +19335 124 +19517 73 +19536 114 +19769 84 +19771 108 +19975 91 +20258 83 +20265 62 +20613 60 +20642 78 +20701 64 +21 82 +21440 74 +21499 54 +21601 83 +280 103 +286 48 +287 72 +296 56 +409 79 +458 72 +531 68 +538 43 +539 84 +540 112 +545 57 +547 92 +549 51 +551 89 +554 59 +559 132 +560 80 +588 101 +672 52 +673 133 +678 70 +698 148 +739 80 +808 64 diff --git a/exp/tts_stats_raw_phn_none/valid/text_shape.phn b/exp/tts_stats_raw_phn_none/valid/text_shape.phn new file mode 100644 index 0000000000000000000000000000000000000000..3bdfa578ed0724ab5cdee22104c6f1f596e6b261 --- /dev/null +++ b/exp/tts_stats_raw_phn_none/valid/text_shape.phn @@ -0,0 +1,65 @@ +115 51,79 +119 156,79 +120 47,79 +123 96,79 +125 117,79 +126 123,79 +128 129,79 +129 68,79 +130 88,79 +131 105,79 +14616 89,79 +14849 74,79 +14891 125,79 +14941 88,79 +14991 96,79 +15003 69,79 +15079 86,79 +15269 59,79 +15665 96,79 +169 104,79 +18237 134,79 +18774 47,79 +18913 60,79 +18963 58,79 +19178 91,79 +19312 84,79 +19335 124,79 +19517 73,79 +19536 114,79 +19769 84,79 +19771 108,79 +19975 91,79 +20258 83,79 +20265 62,79 +20613 60,79 +20642 78,79 +20701 64,79 +21 82,79 +21440 74,79 +21499 54,79 +21601 83,79 +280 103,79 +286 48,79 +287 72,79 +296 56,79 +409 79,79 +458 72,79 +531 68,79 +538 43,79 +539 84,79 +540 112,79 +545 57,79 +547 92,79 +549 51,79 +551 89,79 +554 59,79 +559 132,79 +560 80,79 +588 101,79 +672 52,79 +673 133,79 +678 70,79 +698 148,79 +739 80,79 +808 64,79