log_dir: ./Models/Finetune
save_freq: 1
log_interval: 10
device: cuda
epochs: 50
batch_size: 2
max_len: 160 # maximum number of mel frames (160 × hop 300 = 48,000 samples ≈ 2 s at 24 kHz)
pretrained_model: ./Models/Finetune/base_model_120k_vi.pth
load_only_params: true # set to true to skip loading epoch numbers and optimizer state
debug: false

data_params:
  train_data: data_22k/train_phn.txt
  val_data: data_22k/dev_phn.txt
  root_path: data_22k
  n_speakers: 152

symbol: # 189 symbols in total
  pad: "$"
  punctuation: ';:,.!?¡¿—…"«»“” '
  letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  extend: ""
  # full symbol set = pad + punctuation + letters + letters_ipa + extend

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

training_strats:
  # All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
  freeze_modules: [''] # listed modules are not updated during training
  ignore_modules: ['spk_emb', 'spk_ln', 'style_ln', 'gate'] # not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility models; do NOT ignore them.

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size
  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder: either hifigan, istftnet, or vocos
  decoder:
    type: hifigan
    resblock_kernel_sizes: [3, 7, 11]
    upsample_rates: [10, 5, 3, 2] # product (10 × 5 × 3 × 2 = 300) must match hop_length
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 10, 6, 4]

    # type: 'istftnet'
    # resblock_kernel_sizes: [3, 7, 11]
    # upsample_rates: [10, 6]
    # upsample_initial_channel: 512
    # resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    # upsample_kernel_sizes: [20, 12]
    # gen_istft_n_fft: 20
    # gen_istft_hop_size: 5

    # type: 'vocos'
    # intermediate_dim: 1536
    # num_layers: 8
    # gen_istft_n_fft: 1200
    # gen_istft_hop_size: 300

loss_params:
  lambda_mel: 5.   # mel reconstruction loss
  lambda_gen: 1.   # generator loss
  lambda_mono: 1.  # monotonic alignment loss (TMA)
  lambda_s2s: 1.   # sequence-to-sequence loss (TMA)
  lambda_F0: 1.    # F0 reconstruction loss
  lambda_norm: 1.  # norm reconstruction loss
  lambda_dur: 1.   # duration loss
  lambda_ce: 20.   # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001     # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
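
For context, a minimal sketch in Python of how a training script could consume this file and apply the training_strats flags. It assumes PyYAML and PyTorch; build_model is a hypothetical factory returning a dict of named nn.Module sub-networks, and the checkpoint layout (a 'net' key holding per-module state dicts) is an assumption, not the repo's confirmed format.

import yaml
import torch

with open("config_ft.yml") as f:              # file name is an assumption
    config = yaml.safe_load(f)

model = build_model(config["model_params"])   # hypothetical: returns {name: nn.Module}

checkpoint = torch.load(config["pretrained_model"], map_location="cpu")
state = checkpoint.get("net", checkpoint)     # checkpoint layout is assumed
ignore = set(config["training_strats"]["ignore_modules"])
freeze = set(config["training_strats"]["freeze_modules"])

for name, module in model.items():
    if name in ignore:
        continue                              # fresh start (e.g. spk_emb, gate)
    if name in state:
        module.load_state_dict(state[name], strict=False)
    if name in freeze:
        for p in module.parameters():
            p.requires_grad_(False)           # frozen: excluded from optimizer updates

# With load_only_params: true, epoch counters and optimizer state in the
# checkpoint are discarded; training restarts with fresh optimizers at lr / ft_lr.

The split between lr and ft_lr appears intended to let the pretrained acoustic modules move more slowly than the freshly initialized ones (those in ignore_modules) during fine-tuning.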