| { | |
| "ASR_config": { | |
| "batch_size": 64, | |
| "dataset_params": { | |
| "data_augmentation": false | |
| }, | |
| "device": "cuda", | |
| "epochs": 180, | |
| "log_dir": "logs/20201006", | |
| "model_params": { | |
| "hidden_dim": 256, | |
| "input_dim": 80, | |
| "n_token": 178, | |
| "token_embedding_dim": 512 | |
| }, | |
| "optimizer_params": { | |
| "lr": 0.0005 | |
| }, | |
| "preprocess_parasm": { | |
| "mel_params": { | |
| "n_mels": 80 | |
| }, | |
| "spect_params": { | |
| "hop_length": 300, | |
| "n_fft": 2048, | |
| "win_length": 1200 | |
| }, | |
| "sr": 24000 | |
| }, | |
| "pretrained_model": "", | |
| "save_freq": 5, | |
| "train_data": "ASRDataset/train_list.txt", | |
| "val_data": "ASRDataset/val_list.txt" | |
| }, | |
| "BERT_CONFIG": { | |
| "batch_size": 32, | |
| "data_folder": "wikipedia_20220301.en.processed", | |
| "dataset_params": { | |
| "max_mel_length": 512, | |
| "phoneme_mask_prob": 0.1, | |
| "replace_prob": 0.2, | |
| "token_maps": "token_maps.pkl", | |
| "token_mask": "M", | |
| "token_separator": " ", | |
| "tokenizer": "bert-base-multilingual-cased", | |
| "word_mask_prob": 0.15, | |
| "word_separator": 102 | |
| }, | |
| "log_dir": "Checkpoint_all_phonemes", | |
| "log_interval": 10, | |
| "mixed_precision": "fp16", | |
| "model_params": { | |
| "dropout": 0.1, | |
| "hidden_size": 768, | |
| "intermediate_size": 2048, | |
| "max_position_embeddings": 512, | |
| "num_attention_heads": 12, | |
| "num_hidden_layers": 12, | |
| "vocab_size": 178 | |
| }, | |
| "num_process": 1, | |
| "num_steps": 2000000, | |
| "save_interval": 20000 | |
| }, | |
| "LIBRI_TTS_CONFIG": { | |
| "ASR_config": "Utils/ASR/config.yml", | |
| "ASR_path": "Utils/ASR/epoch_00080.pth", | |
| "F0_path": "Utils/JDC/bst.t7", | |
| "PLBERT_dir": "Utils/PLBERT/", | |
| "batch_size": 8, | |
| "data_params": { | |
| "OOD_data": "Data/OOD_texts.txt", | |
| "min_length": 50, | |
| "root_path": "", | |
| "train_data": "Data/train_list.txt", | |
| "val_data": "Data/val_list.txt" | |
| }, | |
| "device": "cuda", | |
| "epochs_1st": 40, | |
| "epochs_2nd": 25, | |
| "first_stage_path": "first_stage.pth", | |
| "load_only_params": false, | |
| "log_dir": "Models/LibriTTS", | |
| "log_interval": 10, | |
| "loss_params": { | |
| "TMA_epoch": 4, | |
| "diff_epoch": 0, | |
| "joint_epoch": 0, | |
| "lambda_F0": 1.0, | |
| "lambda_ce": 20.0, | |
| "lambda_diff": 1.0, | |
| "lambda_dur": 1.0, | |
| "lambda_gen": 1.0, | |
| "lambda_mel": 5.0, | |
| "lambda_mono": 1.0, | |
| "lambda_norm": 1.0, | |
| "lambda_s2s": 1.0, | |
| "lambda_slm": 1.0, | |
| "lambda_sty": 1.0 | |
| }, | |
| "max_len": 300, | |
| "model_params": { | |
| "decoder": { | |
| "resblock_dilation_sizes": [ | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ], | |
| [ | |
| 1, | |
| 3, | |
| 5 | |
| ] | |
| ], | |
| "resblock_kernel_sizes": [ | |
| 3, | |
| 7, | |
| 11 | |
| ], | |
| "type": "hifigan", | |
| "upsample_initial_channel": 512, | |
| "upsample_kernel_sizes": [ | |
| 20, | |
| 10, | |
| 6, | |
| 4 | |
| ], | |
| "upsample_rates": [ | |
| 10, | |
| 5, | |
| 3, | |
| 2 | |
| ] | |
| }, | |
| "diffusion": { | |
| "dist": { | |
| "estimate_sigma_data": true, | |
| "mean": -3.0, | |
| "sigma_data": 0.19926648961191362, | |
| "std": 1.0 | |
| }, | |
| "embedding_mask_proba": 0.1, | |
| "transformer": { | |
| "head_features": 64, | |
| "multiplier": 2, | |
| "num_heads": 8, | |
| "num_layers": 3 | |
| } | |
| }, | |
| "dim_in": 64, | |
| "dropout": 0.2, | |
| "hidden_dim": 512, | |
| "max_conv_dim": 512, | |
| "max_dur": 50, | |
| "multispeaker": true, | |
| "n_layer": 3, | |
| "n_mels": 80, | |
| "n_token": 178, | |
| "slm": { | |
| "hidden": 768, | |
| "initial_channel": 64, | |
| "model": "microsoft/wavlm-base-plus", | |
| "nlayers": 13, | |
| "sr": 16000 | |
| }, | |
| "style_dim": 128 | |
| }, | |
| "optimizer_params": { | |
| "bert_lr": 1e-05, | |
| "ft_lr": 1e-05, | |
| "lr": 0.0001 | |
| }, | |
| "preprocess_params": { | |
| "spect_params": { | |
| "hop_length": 300, | |
| "n_fft": 2048, | |
| "win_length": 1200 | |
| }, | |
| "sr": 24000 | |
| }, | |
| "pretrained_model": "Models/LibriTTS/epoch_2nd_00002.pth", | |
| "save_freq": 1, | |
| "second_stage_load_pretrained": true, | |
| "slmadv_params": { | |
| "batch_percentage": 0.5, | |
| "iter": 20, | |
| "max_len": 500, | |
| "min_len": 400, | |
| "scale": 0.01, | |
| "sig": 1.5, | |
| "thresh": 5 | |
| } | |
| }, | |
| "config_path": null, | |
| "model_checkpoint_path": null, | |
| "phoneme_converter": "gruut" | |
| } |