# Hugging Face page header (not part of the configuration): he-shuwei's picture
# Upload checkpoints/pretrain_decoder/config.yaml with huggingface_hub
# 233a12e verified
# Flat hyperparameter mapping; top-level keys are listed in alphabetical order.
accumulate_grad_batches: 1
amp: true
attention_dropout: 0.1
# 80 here equals the number of entries in each of the spec_max and spec_min
# lists later in this file.
audio_num_mel_bins: 80
audio_sample_rate: 16000
# Single-entry list naming another config file.
# NOTE(review): presumably the values in this file override that base config;
# confirm against the config loader.
base_config:
- configs/m2se_vtts.yaml
binary_data_dir: data/binary_data_pretrain_decoder
cfg_guidance_scale: 2.0
clip_grad_norm: 1.0
ddim_eta: 0.0
ddim_steps: 100
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
# Settings sharing the decoder_pretrain_ prefix.
decoder_pretrain_mask_span: 10
decoder_pretrain_mask_start_prob: 0.065
decoder_pretrain_mel_mask: true
decoder_pretrain_spec_augment: true
default_num_caption_tokens: 16
diff_decoder_type: transformer-F5Base
diff_loss_type: l2
# Settings sharing the dit_ prefix; dit_qk_norm is explicitly null.
dit_attn_backend: torch
dit_attn_mask_enabled: true
dit_checkpoint_activations: true
dit_drop_path_rate: 0.15
dit_dropout: 0.15
dit_long_skip_connection: true
dit_pe_attn_head: 1
dit_qk_norm: null
dropout: 0.1
ds_workers: 4
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 2
ema_decay: 0.9999
enc_ffn_kernel_size: 9
enc_layers: 4
eval_audio_num_samples: 10
ffn_act: gelu
ffn_hidden_size: 2048
ffn_padding: SAME
# Spectrogram analysis settings; fft_size equals win_size (1024) set later in
# this file.
fft_size: 1024
fmax: 8000
fmin: 0
hidden_size: 512
hop_size: 256
infer: false
# Same value (80) as audio_num_mel_bins earlier in this file.
keep_bins: 80
# Weights sharing the lambda_ prefix.
lambda_energy: 0.05
lambda_f0: 0.5
lambda_uv: 0.5
lgsu_iterations: 2
load_clip: false
lr: 0.0001
# Limits sharing the max_ prefix; the max_valid_* values equal their
# non-valid counterparts (128 sentences, 128000 tokens).
max_epochs: 1000
max_frames: 8000
max_sentences: 128
max_tokens: 128000
max_updates: 160000
max_valid_sentences: 128
max_valid_tokens: 128000
mfa_output_dir: data/processed_data/mfa/outputs
min_snr_gamma: 5
num_ckpt_keep: 3
num_heads: 8
num_sanity_val_steps: 1
num_valid_plots: 10
optimizer_adam_beta2: 0.999
persistent_workers: true
# NOTE(review): this path is under data/binary_data, whereas binary_data_dir in
# this file is data/binary_data_pretrain_decoder; confirm that is intended.
phone_set_path: data/binary_data/phone_set.json
pitch_loss: l1
pitch_norm: standard
pitch_type: frame
# Settings sharing the predictor_ prefix.
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: 256
predictor_kernel: 5
predictor_layers: 3
prefetch_factor: 4
# Both pretrained_*_path values are explicitly null.
pretrained_decoder_path: null
pretrained_encoder_path: null
print_nan_grads: false
processed_data_dir: data/processed_data
raw_data_dir: data/raw_data/soundspaces_speech
# NOTE(review): integer 0 rather than a path or null; presumably means
# "do not resume" -- confirm against the trainer.
resume_from_checkpoint: 0
rmvpe_ckpt: checkpoints/RMVPE/rmvpe.pt
save_best: true
save_codes: []
# NOTE(review): schedule_type and scheduler_type are two distinct keys with the
# same value (cosine); confirm which one each consumer reads before changing
# either.
schedule_type: cosine
scheduler_type: cosine
seed: 1234
self_condition: false
sort_by_len: true
spatial_num_heads: 16
# Settings sharing the spec_aug prefix.
spec_aug_prob: 0.5
spec_augment_freq_masks: 2
spec_augment_freq_width: 10
spec_augment_time_masks: 2
spec_augment_time_width: 50
# 80-entry list, one value per line; the length equals audio_num_mel_bins and
# keep_bins (both 80) in this file. The largest entry is 2.2195 and only the
# last two entries are negative.
# NOTE(review): presumably per-mel-bin maxima used for spectrogram
# normalisation -- confirm against the code that reads spec_max.
spec_max:
- 2.1879
- 1.8991
- 2.1358
- 2.1123
- 2.1055
- 2.1296
- 2.2195
- 2.136
- 2.089
- 2.0317
- 2.182
- 2.0508
- 1.9991
- 2.0789
- 2.1077
- 1.9954
- 2.0502
- 2.0491
- 1.9095
- 1.8531
- 1.9297
- 1.8946
- 1.844
- 1.9792
- 1.8273
- 1.9192
- 1.7508
- 1.7955
- 1.6119
- 1.6795
- 1.7442
- 1.5747
- 1.5096
- 1.6116
- 1.3568
- 1.579
- 1.2652
- 1.3127
- 1.5129
- 1.3126
- 1.3471
- 1.0709
- 1.0851
- 1.1595
- 0.8298
- 0.7789
- 0.9075
- 0.767
- 0.9798
- 0.7773
- 0.5978
- 0.8436
- 0.7244
- 0.8123
- 0.9104
- 0.8252
- 0.8225
- 0.7235
- 0.6883
- 0.8559
- 0.8016
- 0.783
- 0.8467
- 0.6792
- 0.8935
- 0.8483
- 0.571
- 0.7259
- 0.7561
- 0.8435
- 0.6317
- 0.6531
- 0.4406
- 0.3391
- 0.3603
- 0.2577
- 0.3985
- 0.538
- -0.0428
- -0.9947
spec_min:
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
- -11.5129
spk_embed_dim: 192
# Dotted path whose final component is DecoderPretrainTask.
task_cls: m2se_vtts.tasks.pretrain_task.DecoderPretrainTask
tb_log_interval: 10
# Explicit empty string, not null.
test_input_dir: ''
test_num: 100
# Same name as one of the val_prefixes entries later in this file.
test_set_name: test_seen
text_dim: 768
timesteps: 1000
top_k_regions: 140
uncond_prob: 0.15
# Boolean feature switches sharing the use_ prefix.
use_cfg_inference: true
use_controlnet_finetune: false
use_ddim: true
use_ema: true
use_energy_embed: true
use_gt_dur: false
use_gt_f0: false
use_pitch_embed: true
use_pos_embed: true
# NOTE(review): false here while decoder_pretrain_spec_augment is true earlier
# in this file; confirm which flag the pretraining task actually reads.
use_spec_augment: false
use_spk_embed: true
use_spk_id: false
use_uv: true
use_visual: false
uv_label_smoothing: 0.1
val_check_interval: 2000
# Three-entry list of prefixes.
val_prefixes:
- valid
- test_seen
- test_unseen
valid_monitor_key: val_loss
valid_monitor_mode: min
vision_dim: 1024
# vocoder_config is explicitly null while vocoder_ckpt is set.
vocoder: bigvgan
vocoder_ckpt: checkpoints/bigvgan/g_00076000
vocoder_config: null
vt_enc_layers: 3
warmup_updates: 4000
weight_decay: 0.08
# Equals fft_size (1024) set earlier in this file.
win_size: 1024
# NOTE(review): the directory name ends in _emilia, whereas the upload note at
# the top of this file refers to checkpoints/pretrain_decoder; confirm the
# intended output directory.
work_dir: checkpoints/pretrain_decoder_emilia