| infer: | |
| n_steps: 32 | |
| cfg: 3 | |
| audio: | |
| hop_size: 480 | |
| sample_rate: 24000 | |
| max_length: 36000 | |
| n_fft: 1920 | |
| num_mels: 128 | |
| win_size: 1920 | |
| fmin: 0 | |
| fmax: 12000 | |
| mel_var: 8.14 | |
| mel_mean: -4.92 | |
| model: | |
| encoder: | |
| vocab_size: 3000 | |
| text_dim: 512 | |
| pitch_dim: 512 | |
| type_dim: 512 | |
| f0_bin: 361 | |
| f0_dim: 512 | |
| num_layers: 4 | |
| flow_matching: | |
| mel_dim: 128 | |
| hidden_size: 1024 | |
| num_layers: 22 | |
| num_heads: 16 | |
| cfg_drop_prob: 0.2 | |
| use_embedding: False | |
| cond_codebook_size: 512 | |
| cond_scale_factor: 1 | |
| sigma: 1.0e-5 | |
| time_scheduler: cos |