| config: conf/tuning/transfer_visinger.yaml |
| print_config: false |
| log_level: INFO |
| drop_last_iter: false |
| dry_run: false |
| iterator_type: sequence |
| valid_iterator_type: null |
| output_dir: exp/svs_visinger_normal |
| ngpu: 1 |
| seed: 777 |
| num_workers: 0 |
| num_att_plot: 3 |
| dist_backend: nccl |
| dist_init_method: env:// |
| dist_world_size: null |
| dist_rank: null |
| local_rank: 0 |
| dist_master_addr: null |
| dist_master_port: null |
| dist_launcher: null |
| multiprocessing_distributed: false |
| unused_parameters: true |
| sharded_ddp: false |
| cudnn_enabled: true |
| cudnn_benchmark: false |
| cudnn_deterministic: false |
| collect_stats: false |
| write_collected_feats: false |
| max_epoch: 500 |
| patience: null |
| val_scheduler_criterion: |
| - valid |
| - loss |
| early_stopping_criterion: |
| - valid |
| - loss |
| - min |
| best_model_criterion: |
| - - train |
| - total_count |
| - max |
| keep_nbest_models: 10 |
| nbest_averaging_interval: 0 |
| grad_clip: -1 |
| grad_clip_type: 2.0 |
| grad_noise: false |
| accum_grad: 1 |
| no_forward_run: false |
| resume: true |
| train_dtype: float32 |
| use_amp: false |
| log_interval: 50 |
| use_matplotlib: true |
| use_tensorboard: true |
| create_graph_in_tensorboard: false |
| use_wandb: false |
| wandb_project: null |
| wandb_id: null |
| wandb_entity: null |
| wandb_name: null |
| wandb_model_log_interval: -1 |
| detect_anomaly: false |
| use_lora: false |
| save_lora_only: true |
| lora_conf: {} |
| pretrain_path: null |
| init_param: [] |
| ignore_init_mismatch: false |
| freeze_param: [] |
| num_iters_per_epoch: 1000 |
| batch_size: 8 |
| valid_batch_size: null |
| batch_bins: 1000000 |
| valid_batch_bins: null |
| train_shape_file: |
| - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn |
| - exp/svs_stats_raw_phn_None_zh/train/singing_shape |
| valid_shape_file: |
| - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn |
| - exp/svs_stats_raw_phn_None_zh/valid/singing_shape |
| batch_type: sorted |
| valid_batch_type: null |
| fold_length: |
| - 150 |
| - 409600 |
| sort_in_batch: descending |
| shuffle_within_batch: false |
| sort_batch: descending |
| multiple_iterator: false |
| chunk_length: 500 |
| chunk_shift_ratio: 0.5 |
| num_cache_chunks: 1024 |
| chunk_excluded_key_prefixes: [] |
| chunk_default_fs: null |
| train_data_path_and_name_and_type: |
| - - dump/raw/tr_no_dev/text |
| - text |
| - text |
| - - dump/raw/tr_no_dev/wav.scp |
| - singing |
| - sound |
| - - dump/raw/tr_no_dev/label |
| - label |
| - duration |
| - - dump/raw/tr_no_dev/score.scp |
| - score |
| - score |
| - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp |
| - pitch |
| - npy |
| - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp |
| - feats |
| - npy |
| valid_data_path_and_name_and_type: |
| - - dump/raw/dev/text |
| - text |
| - text |
| - - dump/raw/dev/wav.scp |
| - singing |
| - sound |
| - - dump/raw/dev/label |
| - label |
| - duration |
| - - dump/raw/dev/score.scp |
| - score |
| - score |
| - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp |
| - pitch |
| - npy |
| - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp |
| - feats |
| - npy |
| allow_variable_data_keys: false |
| max_cache_size: 0.0 |
| max_cache_fd: 32 |
| allow_multi_rates: false |
| valid_max_cache_size: null |
| exclude_weight_decay: false |
| exclude_weight_decay_conf: {} |
| optim: adamw |
| optim_conf: |
| lr: 0.0002 |
| betas: |
| - 0.8 |
| - 0.99 |
| eps: 1.0e-09 |
| weight_decay: 0.0 |
| scheduler: exponentiallr |
| scheduler_conf: |
| gamma: 0.998 |
| optim2: adamw |
| optim2_conf: |
| lr: 0.0002 |
| betas: |
| - 0.8 |
| - 0.99 |
| eps: 1.0e-09 |
| weight_decay: 0.0 |
| scheduler2: exponentiallr |
| scheduler2_conf: |
| gamma: 0.998 |
| generator_first: true |
| token_list: |
| - <blank> |
| - <unk> |
| - SP |
| - i |
| - AP |
| - e |
| - y |
| - d |
| - w |
| - sh |
| - ai |
| - n |
| - x |
| - j |
| - ian |
| - u |
| - l |
| - h |
| - b |
| - o |
| - zh |
| - an |
| - ou |
| - m |
| - q |
| - z |
| - en |
| - g |
| - ing |
| - ei |
| - ao |
| - ang |
| - uo |
| - eng |
| - t |
| - a |
| - ong |
| - ui |
| - k |
| - f |
| - r |
| - iang |
| - ch |
| - v |
| - in |
| - iao |
| - ie |
| - iu |
| - c |
| - s |
| - van |
| - p |
| - ve |
| - uan |
| - uang |
| - ia |
| - ua |
| - uai |
| - un |
| - er |
| - vn |
| - iong |
| - <sos/eos> |
| odim: null |
| model_conf: {} |
| use_preprocessor: true |
| token_type: phn |
| bpemodel: null |
| non_linguistic_symbols: null |
| cleaner: null |
| g2p: null |
| fs: 44100 |
| score_feats_extract: syllable_score_feats |
| score_feats_extract_conf: |
| fs: 44100 |
| n_fft: 2048 |
| win_length: 2048 |
| hop_length: 512 |
| feats_extract: fbank |
| feats_extract_conf: |
| n_fft: 2048 |
| hop_length: 512 |
| win_length: 2048 |
| fs: 44100 |
| fmin: 0 |
| fmax: 22050 |
| n_mels: 80 |
| normalize: global_mvn |
| normalize_conf: |
| stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz |
| svs: vits |
| svs_conf: |
| generator_type: visinger |
| vocoder_generator_type: hifigan |
| generator_params: |
| hidden_channels: 192 |
| spks: -1 |
| global_channels: 256 |
| segment_size: 20 |
| text_encoder_attention_heads: 2 |
| text_encoder_ffn_expand: 4 |
| text_encoder_blocks: 6 |
| text_encoder_positionwise_layer_type: conv1d |
| text_encoder_positionwise_conv_kernel_size: 3 |
| text_encoder_positional_encoding_layer_type: rel_pos |
| text_encoder_self_attention_layer_type: rel_selfattn |
| text_encoder_activation_type: swish |
| text_encoder_normalize_before: true |
| text_encoder_dropout_rate: 0.1 |
| text_encoder_positional_dropout_rate: 0.0 |
| text_encoder_attention_dropout_rate: 0.1 |
| use_macaron_style_in_text_encoder: true |
| use_conformer_conv_in_text_encoder: false |
| text_encoder_conformer_kernel_size: -1 |
| decoder_kernel_size: 7 |
| decoder_channels: 512 |
| decoder_upsample_scales: |
| - 8 |
| - 8 |
| - 4 |
| - 2 |
| decoder_upsample_kernel_sizes: |
| - 16 |
| - 16 |
| - 8 |
| - 4 |
| decoder_resblock_kernel_sizes: |
| - 3 |
| - 7 |
| - 11 |
| decoder_resblock_dilations: |
| - - 1 |
| - 3 |
| - 5 |
| - - 1 |
| - 3 |
| - 5 |
| - - 1 |
| - 3 |
| - 5 |
| use_weight_norm_in_decoder: true |
| posterior_encoder_kernel_size: 3 |
| posterior_encoder_layers: 8 |
| posterior_encoder_stacks: 1 |
| posterior_encoder_base_dilation: 1 |
| posterior_encoder_dropout_rate: 0.0 |
| use_weight_norm_in_posterior_encoder: true |
| flow_flows: -1 |
| flow_kernel_size: 5 |
| flow_base_dilation: 1 |
| flow_layers: 4 |
| flow_dropout_rate: 0.0 |
| use_weight_norm_in_flow: true |
| use_only_mean_in_flow: true |
| use_phoneme_predictor: false |
| vocabs: 63 |
| aux_channels: 80 |
| generator_type: visinger |
| vocoder_generator_type: hifigan |
| fs: 44100 |
| hop_length: 512 |
| win_length: 2048 |
| n_fft: 2048 |
| discriminator_type: visinger2 |
| discriminator_params: |
| scales: 1 |
| scale_downsample_pooling: AvgPool1d |
| scale_downsample_pooling_params: |
| kernel_size: 4 |
| stride: 2 |
| padding: 2 |
| scale_discriminator_params: |
| in_channels: 1 |
| out_channels: 1 |
| kernel_sizes: |
| - 15 |
| - 41 |
| - 5 |
| - 3 |
| channels: 128 |
| max_downsample_channels: 1024 |
| max_groups: 256 |
| bias: true |
| downsample_scales: |
| - 4 |
| - 4 |
| - 4 |
| - 4 |
| nonlinear_activation: LeakyReLU |
| nonlinear_activation_params: |
| negative_slope: 0.1 |
| use_weight_norm: true |
| use_spectral_norm: false |
| follow_official_norm: false |
| periods: |
| - 2 |
| - 3 |
| - 5 |
| - 7 |
| - 11 |
| period_discriminator_params: |
| in_channels: 1 |
| out_channels: 1 |
| kernel_sizes: |
| - 5 |
| - 3 |
| channels: 32 |
| downsample_scales: |
| - 3 |
| - 3 |
| - 3 |
| - 3 |
| - 1 |
| max_downsample_channels: 1024 |
| bias: true |
| nonlinear_activation: LeakyReLU |
| nonlinear_activation_params: |
| negative_slope: 0.1 |
| use_weight_norm: true |
| use_spectral_norm: false |
| multi_freq_disc_params: |
| hidden_channels: |
| - 256 |
| - 256 |
| - 256 |
| - 256 |
| - 256 |
| domain: double |
| mel_scale: true |
| divisors: |
| - 32 |
| - 16 |
| - 8 |
| - 4 |
| - 2 |
| - 1 |
| - 1 |
| strides: |
| - 1 |
| - 2 |
| - 1 |
| - 2 |
| - 1 |
| - 2 |
| - 1 |
| sample_rate: 44100 |
| hop_lengths: |
| - 110 |
| - 220 |
| - 330 |
| - 441 |
| - 551 |
| - 661 |
| generator_adv_loss_params: |
| average_by_discriminators: false |
| loss_type: mse |
| discriminator_adv_loss_params: |
| average_by_discriminators: false |
| loss_type: mse |
| feat_match_loss_params: |
| average_by_discriminators: false |
| average_by_layers: false |
| include_final_outputs: true |
| mel_loss_params: |
| fs: 44100 |
| n_fft: 2048 |
| hop_length: 512 |
| win_length: 2048 |
| window: hann |
| n_mels: 80 |
| fmin: 0 |
| fmax: 22050 |
| log_base: null |
| lambda_adv: 1.0 |
| lambda_mel: 45.0 |
| lambda_feat_match: 2.0 |
| lambda_dur: 0.1 |
| lambda_pitch: 10.0 |
| lambda_phoneme: 1.0 |
| lambda_kl: 1.0 |
| sampling_rate: 44100 |
| cache_generator_outputs: true |
| pitch_extract: dio |
| pitch_extract_conf: |
| use_token_averaged_f0: false |
| use_log_f0: false |
| fs: 44100 |
| n_fft: 2048 |
| hop_length: 512 |
| f0max: 800 |
| f0min: 80 |
| pitch_normalize: null |
| pitch_normalize_conf: |
| stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz |
| ying_extract: null |
| ying_extract_conf: {} |
| energy_extract: null |
| energy_extract_conf: {} |
| energy_normalize: null |
| energy_normalize_conf: {} |
| required: |
| - output_dir |
| - token_list |
| version: '202310' |
| distributed: false |
|
|