| accumulate_grad_batches: 1 |
| amp: true |
| attention_dropout: 0.1 |
| audio_num_mel_bins: 80 |
| audio_sample_rate: 16000 |
| base_config: |
| - configs/m2se_vtts.yaml |
| binary_data_dir: data/binary_data_pretrain_decoder |
| cfg_guidance_scale: 2.0 |
| clip_grad_norm: 1.0 |
| ddim_eta: 0.0 |
| ddim_steps: 100 |
| debug: false |
| dec_ffn_kernel_size: 9 |
| dec_layers: 4 |
| decoder_pretrain_mask_span: 10 |
| decoder_pretrain_mask_start_prob: 0.065 |
| decoder_pretrain_mel_mask: true |
| decoder_pretrain_spec_augment: true |
| default_num_caption_tokens: 16 |
| diff_decoder_type: transformer-F5Base |
| diff_loss_type: l2 |
| dit_attn_backend: torch |
| dit_attn_mask_enabled: true |
| dit_checkpoint_activations: true |
| dit_drop_path_rate: 0.15 |
| dit_dropout: 0.15 |
| dit_long_skip_connection: true |
| dit_pe_attn_head: 1 |
| dit_qk_norm: null |
| dropout: 0.1 |
| ds_workers: 4 |
| dur_loss: mse |
| dur_predictor_kernel: 3 |
| dur_predictor_layers: 2 |
| ema_decay: 0.9999 |
| enc_ffn_kernel_size: 9 |
| enc_layers: 4 |
| eval_audio_num_samples: 10 |
| ffn_act: gelu |
| ffn_hidden_size: 2048 |
| ffn_padding: SAME |
| fft_size: 1024 |
| fmax: 8000 |
| fmin: 0 |
| hidden_size: 512 |
| hop_size: 256 |
| infer: false |
| keep_bins: 80 |
| lambda_energy: 0.05 |
| lambda_f0: 0.5 |
| lambda_uv: 0.5 |
| lgsu_iterations: 2 |
| load_clip: false |
| lr: 0.0001 |
| max_epochs: 1000 |
| max_frames: 8000 |
| max_sentences: 128 |
| max_tokens: 128000 |
| max_updates: 160000 |
| max_valid_sentences: 128 |
| max_valid_tokens: 128000 |
| mfa_output_dir: data/processed_data/mfa/outputs |
| min_snr_gamma: 5 |
| num_ckpt_keep: 3 |
| num_heads: 8 |
| num_sanity_val_steps: 1 |
| num_valid_plots: 10 |
| optimizer_adam_beta2: 0.999 |
| persistent_workers: true |
| phone_set_path: data/binary_data/phone_set.json |
| pitch_loss: l1 |
| pitch_norm: standard |
| pitch_type: frame |
| predictor_dropout: 0.5 |
| predictor_grad: 0.1 |
| predictor_hidden: 256 |
| predictor_kernel: 5 |
| predictor_layers: 3 |
| prefetch_factor: 4 |
| pretrained_decoder_path: null |
| pretrained_encoder_path: null |
| print_nan_grads: false |
| processed_data_dir: data/processed_data |
| raw_data_dir: data/raw_data/soundspaces_speech |
| resume_from_checkpoint: 0 |
| rmvpe_ckpt: checkpoints/RMVPE/rmvpe.pt |
| save_best: true |
| save_codes: [] |
| schedule_type: cosine |
| scheduler_type: cosine |
| seed: 1234 |
| self_condition: false |
| sort_by_len: true |
| spatial_num_heads: 16 |
| spec_aug_prob: 0.5 |
| spec_augment_freq_masks: 2 |
| spec_augment_freq_width: 10 |
| spec_augment_time_masks: 2 |
| spec_augment_time_width: 50 |
| spec_max: |
| - 2.1879 |
| - 1.8991 |
| - 2.1358 |
| - 2.1123 |
| - 2.1055 |
| - 2.1296 |
| - 2.2195 |
| - 2.136 |
| - 2.089 |
| - 2.0317 |
| - 2.182 |
| - 2.0508 |
| - 1.9991 |
| - 2.0789 |
| - 2.1077 |
| - 1.9954 |
| - 2.0502 |
| - 2.0491 |
| - 1.9095 |
| - 1.8531 |
| - 1.9297 |
| - 1.8946 |
| - 1.844 |
| - 1.9792 |
| - 1.8273 |
| - 1.9192 |
| - 1.7508 |
| - 1.7955 |
| - 1.6119 |
| - 1.6795 |
| - 1.7442 |
| - 1.5747 |
| - 1.5096 |
| - 1.6116 |
| - 1.3568 |
| - 1.579 |
| - 1.2652 |
| - 1.3127 |
| - 1.5129 |
| - 1.3126 |
| - 1.3471 |
| - 1.0709 |
| - 1.0851 |
| - 1.1595 |
| - 0.8298 |
| - 0.7789 |
| - 0.9075 |
| - 0.767 |
| - 0.9798 |
| - 0.7773 |
| - 0.5978 |
| - 0.8436 |
| - 0.7244 |
| - 0.8123 |
| - 0.9104 |
| - 0.8252 |
| - 0.8225 |
| - 0.7235 |
| - 0.6883 |
| - 0.8559 |
| - 0.8016 |
| - 0.783 |
| - 0.8467 |
| - 0.6792 |
| - 0.8935 |
| - 0.8483 |
| - 0.571 |
| - 0.7259 |
| - 0.7561 |
| - 0.8435 |
| - 0.6317 |
| - 0.6531 |
| - 0.4406 |
| - 0.3391 |
| - 0.3603 |
| - 0.2577 |
| - 0.3985 |
| - 0.538 |
| - -0.0428 |
| - -0.9947 |
| spec_min: |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| - -11.5129 |
| spk_embed_dim: 192 |
| task_cls: m2se_vtts.tasks.pretrain_task.DecoderPretrainTask |
| tb_log_interval: 10 |
| test_input_dir: '' |
| test_num: 100 |
| test_set_name: test_seen |
| text_dim: 768 |
| timesteps: 1000 |
| top_k_regions: 140 |
| uncond_prob: 0.15 |
| use_cfg_inference: true |
| use_controlnet_finetune: false |
| use_ddim: true |
| use_ema: true |
| use_energy_embed: true |
| use_gt_dur: false |
| use_gt_f0: false |
| use_pitch_embed: true |
| use_pos_embed: true |
| use_spec_augment: false |
| use_spk_embed: true |
| use_spk_id: false |
| use_uv: true |
| use_visual: false |
| uv_label_smoothing: 0.1 |
| val_check_interval: 2000 |
| val_prefixes: |
| - valid |
| - test_seen |
| - test_unseen |
| valid_monitor_key: val_loss |
| valid_monitor_mode: min |
| vision_dim: 1024 |
| vocoder: bigvgan |
| vocoder_ckpt: checkpoints/bigvgan/g_00076000 |
| vocoder_config: null |
| vt_enc_layers: 3 |
| warmup_updates: 4000 |
| weight_decay: 0.08 |
| win_size: 1024 |
| work_dir: checkpoints/pretrain_decoder_emilia |
|
|