| dataset: |
| training: [] |
| validation: [] |
| noise: [] |
| |
| speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'" |
| |
| use_hdf5: True |
| use_metadata: True |
| hdf5_flag: r |
| validate: True |
|
|
| workers: 2 |
| cache: True |
|
|
| phones_range: [4, 256] |
| duration_range: [1.0, 16.0] |
|
|
| random_utterance: 1.0 |
| max_prompts: 3 |
| prompt_duration: 6.0 |
|
|
| sample_type: speaker |
|
|
| tasks_list: [ "tts" ] |
|
|
| models: |
| _prom_levels: 8 |
| _max_levels: 8 |
|
|
| _models: |
| - name: "ar+nar" |
| size: "full" |
| resp_levels: 8 |
| prom_levels: 8 |
| tasks: 8 |
| arch_type: "retnet" |
| training: True |
| version: 2 |
|
|
| hyperparameters: |
| batch_size: 8 |
| gradient_accumulation_steps: 32 |
| gradient_clipping: 100 |
| |
| optimizer: Prodigy |
| torch_optimizer: True |
| learning_rate: 1.0 |
| |
| scheduler_type: "" |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
|
|
| evaluation: |
| batch_size: 16 |
| frequency: 250 |
| size: 16 |
| |
| steps: 450 |
| ar_temperature: 0.95 |
| nar_temperature: 0.25 |
| load_disabled_engines: True |
|
|
| trainer: |
| iterations: 1_000_000 |
| |
| save_tag: step |
| save_on_oom: True |
| save_on_quit: True |
| save_frequency: 100 |
| export_on_save: True |
|
|
| keep_last_checkpoints: 4 |
|
|
| aggressive_optimizations: False |
| load_disabled_engines: False |
|
|
| |
| |
| |
| |
| |
| |
| gc_mode: None |
|
|
| weight_dtype: bfloat16 |
| amp: False |
|
|
| backend: deepspeed |
| deepspeed: |
| zero_optimization_level: 0 |
| use_compression_training: True |
|
|
| activation_checkpointing: True |
|
|
| inference: |
| use_vocos: True |
| normalize: False |
|
|
| weight_dtype: bfloat16 |
| amp: False |
|
|
| bitsandbytes: |
| enabled: False |
| injects: True |
| linear: True |
| embedding: True |
| |