Spaces:
Sleeping
Sleeping
| log_dir: ./Models/Finetune | |
| save_freq: 1 | |
| log_interval: 10 | |
| device: cuda | |
| epochs: 50 | |
| batch_size: 2 | |
| max_len: 160 # maximum number of frames | |
| pretrained_model: ./Models/Finetune/base_model_120k_vi.pth | |
| load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters | |
| debug: false | |
| data_params: | |
| train_data: data_22k/train_phn.txt | |
| val_data: data_22k/dev_phn.txt | |
| root_path: data_22k | |
| n_speakers: 152 | |
| symbol: #Total 189 symbols | |
| pad: "$" | |
| punctuation: ';:,.!?¡¿—…"«»“” ' | |
| letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" | |
| letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" | |
| extend: "" | |
| symbol: | |
| pad | |
| punctuation | |
| letters | |
| letters_ipa | |
| extend | |
| preprocess_params: | |
| sr: 24000 | |
| spect_params: | |
| n_fft: 2048 | |
| win_length: 1200 | |
| hop_length: 300 | |
| training_strats: | |
| #All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd' | |
| freeze_modules: [''] # Not updated when training. | |
| ignore_modules: ['spk_emb', 'spk_ln', 'style_ln', 'gate'] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility modules; DO NOT ignore them. | |
| model_params: | |
| dim_in: 64 | |
| hidden_dim: 512 | |
| max_conv_dim: 512 | |
| n_layer: 3 | |
| n_mels: 80 | |
| max_dur: 50 # maximum duration of a single phoneme | |
| style_dim: 128 # style vector size | |
| dropout: 0.2 | |
| ASR_params: | |
| input_dim: 80 | |
| hidden_dim: 256 | |
| n_layers: 6 | |
| token_embedding_dim: 512 | |
| JDC_params: | |
| num_class: 1 | |
| seq_len: 192 | |
| # config for decoder | |
| decoder: # either hifigan or istftnet or vocos | |
| type: hifigan | |
| resblock_kernel_sizes: [3,7,11] | |
| upsample_rates : [10,5,3,2] | |
| upsample_initial_channel: 512 | |
| resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] | |
| upsample_kernel_sizes: [20,10,6,4] | |
| # type: 'istftnet' | |
| # resblock_kernel_sizes: [3,7,11] | |
| # upsample_rates : [10, 6] | |
| # upsample_initial_channel: 512 | |
| # resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] | |
| # upsample_kernel_sizes: [20, 12] | |
| # gen_istft_n_fft: 20 | |
| # gen_istft_hop_size: 5 | |
| # type: 'vocos' | |
| # intermediate_dim: 1536 | |
| # num_layers: 8 | |
| # gen_istft_n_fft: 1200 | |
| # gen_istft_hop_size: 300 | |
| loss_params: | |
| lambda_mel: 5. # mel reconstruction loss | |
| lambda_gen: 1. # generator loss | |
| lambda_mono: 1. # monotonic alignment loss (TMA) | |
| lambda_s2s: 1. # sequence-to-sequence loss (TMA) | |
| lambda_F0: 1. # F0 reconstruction loss | |
| lambda_norm: 1. # norm reconstruction loss | |
| lambda_dur: 1. # duration loss | |
| lambda_ce: 20. # duration predictor probability output CE loss | |
| optimizer_params: | |
| lr: 0.0001 # general learning rate | |
| ft_lr: 0.00001 # learning rate for acoustic modules |