---
# Configuration with `evaluation`, `model` and `data` sections plus a few
# root-level scalars.
#
# Values written as ${...} are interpolations that refer to other keys in this
# file by their path from the root (for example ${model.num_channels}). They
# are plain strings to YAML; resolving them is the config loader's job.
#
# NOTE(review): the nesting in this file was reconstructed from the key
# references, because the previous revision had lost its indentation.
#   - Grounded: `max_frames`, `lrc_upsample_factor` and `project_root` are
#     referenced without a prefix, so they live at the root; `num_channels` is
#     referenced as ${model.num_channels}, so it lives under `model`; `cfm`,
#     `dit` and `train_dataset` each set their own `max_frames`, so they must
#     be nested mappings rather than root-level keys.
#   - Assumed (TODO confirm against the code that reads this file): `dataset`
#     and `sample_kwargs` belong to `evaluation`, `cfm` and `dit` belong to
#     `model`, and `seed` is a root-level key.

# Base directory used to build the ${project_root}/public/... asset paths.
project_root: "."

evaluation:
  # Empty by default. NOTE(review): presumably has to be overridden with a real
  # checkpoint location before running - confirm how the loader treats "".
  checkpoint_path: ""
  output_dir: "outputs"
  test_set_path: "inputs/input.json"
  negative_style_prompt: ${project_root}/public/vocal.npy
  # null leaves the value unset; NOTE(review): presumably means "no limit" -
  # confirm.
  num_samples: null
  batch_size: 1
  random_crop_style: false
  vae_type: "diffrhythm"
  num_style_secs: 30
  ignore_style: false
  use_prompt_style: false

  dataset:
    # NOTE(review): "placeholder" looks like a dummy value meant to be replaced
    # or ignored at run time - confirm.
    pattern: "placeholder"
    shuffle: false
    resample_by_duration_threshold: null
    always_crop_from_beginning: true
    always_use_style_index: 0

  sample_kwargs:
    cfg_range:
      - 0.05
      - 1
    dual_cfg:
      - 4.7
      - 2.5
    steps: 50

model:
  # Single source of truth for the channel count; `cfm.num_channels` and
  # `dit.mel_dim` both interpolate this value.
  num_channels: 64

  cfm:
    max_frames: ${max_frames}
    num_channels: ${model.num_channels}
    dual_drop_prob: [0.1, 0.5]
    no_edit: true

  dit:
    max_frames: ${max_frames}
    mel_dim: ${model.num_channels}
    dim: 1408
    depth: 16
    heads: 32
    ff_mult: 4
    text_dim: 512
    conv_layers: 4
    grad_ckpt: true
    use_implicit_duration: true

data:
  train_dataset:
    max_frames: ${max_frames}
    multiple_styles: true
    sampling_rate: 44100
    shuffle: true
    # The file name is spelled "silience" here; it is kept as-is because it
    # must match the asset on disk. NOTE(review): verify the actual file name.
    silence_latent_path: ${project_root}/public/silience_latent.pt
    # NOTE(review): `tokenizer_path` and `phonemizer_checkpoint` point at the
    # same file - confirm this is intentional.
    tokenizer_path: ${project_root}/public/en_us_cmudict_ipa_forward.pt
    lrc_upsample_factor: ${lrc_upsample_factor}
    filler: average_sparse
    phonemizer_checkpoint: ${project_root}/public/en_us_cmudict_ipa_forward.pt

# Root-level values shared across sections through ${...} interpolation:
# `max_frames` is used by model.cfm, model.dit and data.train_dataset;
# `lrc_upsample_factor` is used by data.train_dataset.
max_frames: 5000
lrc_upsample_factor: 4
seed: 42