accumulate_grad_batches: 1 amp: true attention_dropout: 0.1 audio_num_mel_bins: 80 audio_sample_rate: 16000 base_config: - configs/m2se_vtts.yaml binary_data_dir: data/binary_data_pretrain_decoder cfg_guidance_scale: 2.0 clip_grad_norm: 1.0 ddim_eta: 0.0 ddim_steps: 100 debug: false dec_ffn_kernel_size: 9 dec_layers: 4 decoder_pretrain_mask_span: 10 decoder_pretrain_mask_start_prob: 0.065 decoder_pretrain_mel_mask: true decoder_pretrain_spec_augment: true default_num_caption_tokens: 16 diff_decoder_type: transformer-F5Base diff_loss_type: l2 dit_attn_backend: torch dit_attn_mask_enabled: true dit_checkpoint_activations: true dit_drop_path_rate: 0.15 dit_dropout: 0.15 dit_long_skip_connection: true dit_pe_attn_head: 1 dit_qk_norm: null dropout: 0.1 ds_workers: 4 dur_loss: mse dur_predictor_kernel: 3 dur_predictor_layers: 2 ema_decay: 0.9999 enc_ffn_kernel_size: 9 enc_layers: 4 eval_audio_num_samples: 10 ffn_act: gelu ffn_hidden_size: 2048 ffn_padding: SAME fft_size: 1024 fmax: 8000 fmin: 0 hidden_size: 512 hop_size: 256 infer: false keep_bins: 80 lambda_energy: 0.05 lambda_f0: 0.5 lambda_uv: 0.5 lgsu_iterations: 2 load_clip: false lr: 0.0001 max_epochs: 1000 max_frames: 8000 max_sentences: 128 max_tokens: 128000 max_updates: 160000 max_valid_sentences: 128 max_valid_tokens: 128000 mfa_output_dir: data/processed_data/mfa/outputs min_snr_gamma: 5 num_ckpt_keep: 3 num_heads: 8 num_sanity_val_steps: 1 num_valid_plots: 10 optimizer_adam_beta2: 0.999 persistent_workers: true phone_set_path: data/binary_data/phone_set.json pitch_loss: l1 pitch_norm: standard pitch_type: frame predictor_dropout: 0.5 predictor_grad: 0.1 predictor_hidden: 256 predictor_kernel: 5 predictor_layers: 3 prefetch_factor: 4 pretrained_decoder_path: null pretrained_encoder_path: null print_nan_grads: false processed_data_dir: data/processed_data raw_data_dir: data/raw_data/soundspaces_speech resume_from_checkpoint: 0 rmvpe_ckpt: checkpoints/RMVPE/rmvpe.pt save_best: true save_codes: [] schedule_type: cosine scheduler_type: cosine seed: 1234 self_condition: false sort_by_len: true spatial_num_heads: 16 spec_aug_prob: 0.5 spec_augment_freq_masks: 2 spec_augment_freq_width: 10 spec_augment_time_masks: 2 spec_augment_time_width: 50 spec_max: - 2.1879 - 1.8991 - 2.1358 - 2.1123 - 2.1055 - 2.1296 - 2.2195 - 2.136 - 2.089 - 2.0317 - 2.182 - 2.0508 - 1.9991 - 2.0789 - 2.1077 - 1.9954 - 2.0502 - 2.0491 - 1.9095 - 1.8531 - 1.9297 - 1.8946 - 1.844 - 1.9792 - 1.8273 - 1.9192 - 1.7508 - 1.7955 - 1.6119 - 1.6795 - 1.7442 - 1.5747 - 1.5096 - 1.6116 - 1.3568 - 1.579 - 1.2652 - 1.3127 - 1.5129 - 1.3126 - 1.3471 - 1.0709 - 1.0851 - 1.1595 - 0.8298 - 0.7789 - 0.9075 - 0.767 - 0.9798 - 0.7773 - 0.5978 - 0.8436 - 0.7244 - 0.8123 - 0.9104 - 0.8252 - 0.8225 - 0.7235 - 0.6883 - 0.8559 - 0.8016 - 0.783 - 0.8467 - 0.6792 - 0.8935 - 0.8483 - 0.571 - 0.7259 - 0.7561 - 0.8435 - 0.6317 - 0.6531 - 0.4406 - 0.3391 - 0.3603 - 0.2577 - 0.3985 - 0.538 - -0.0428 - -0.9947 spec_min: - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 - -11.5129 spk_embed_dim: 192 task_cls: m2se_vtts.tasks.pretrain_task.DecoderPretrainTask tb_log_interval: 10 test_input_dir: '' test_num: 100 test_set_name: test_seen text_dim: 768 timesteps: 1000 top_k_regions: 140 uncond_prob: 0.15 use_cfg_inference: true use_controlnet_finetune: false use_ddim: true use_ema: true use_energy_embed: true use_gt_dur: false use_gt_f0: false use_pitch_embed: true use_pos_embed: true use_spec_augment: false use_spk_embed: true use_spk_id: false use_uv: true use_visual: false uv_label_smoothing: 0.1 val_check_interval: 2000 val_prefixes: - valid - test_seen - test_unseen valid_monitor_key: val_loss valid_monitor_mode: min vision_dim: 1024 vocoder: bigvgan vocoder_ckpt: checkpoints/bigvgan/g_00076000 vocoder_config: null vt_enc_layers: 3 warmup_updates: 4000 weight_decay: 0.08 win_size: 1024 work_dir: checkpoints/pretrain_decoder_emilia