# Dataset settings.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
dataset:
  # All three lists are empty: no entries are configured in this file.
  training: []
  validation: []
  noise: []

  # NOTE(review): the value is Python lambda source held in a string;
  # presumably the loader evaluates it — confirm this file is trusted input.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"

  use_hdf5: true
  use_metadata: true
  hdf5_flag: r
  validate: true

  workers: 4
  cache: true

  # Two-element lists, presumably [min, max]; units are not stated here.
  phones_range: [4, 256]
  duration_range: [1.0, 16.0]

  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 3.0

  sample_type: speaker

  tasks_list: ["tts"]
|
|
# Model definitions.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
models:
  _prom_levels: 4
  _max_levels: 8

  # Two entries, "ar" and "nar"; they differ only in resp_levels and
  # prom_levels.
  _models:
    - name: "ar"
      size: "full"
      resp_levels: 1
      prom_levels: 2
      tasks: 8
      arch_type: "retnet"
      training: true
    - name: "nar"
      size: "full"
      resp_levels: 3
      prom_levels: 4
      tasks: 8
      arch_type: "retnet"
      training: true
|
|
|
|
# Optimisation hyperparameters.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
hyperparameters:
  batch_size: 8
  gradient_accumulation_steps: 1
  gradient_clipping: 100

  optimizer: AdamW
  # Keep the decimal point and the signed exponent: YAML 1.1 loaders only
  # resolve this form as a float.
  learning_rate: 1.0e-5

  # Explicit empty string (not null).
  scheduler_type: ""
|
|
# Evaluation settings.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
evaluation:
  batch_size: 16
  frequency: 500
  size: 16

  steps: 300
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: true
|
|
# Trainer settings.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below (including `deepspeed` as a child of `trainer`) is
# reconstructed from the keys — confirm against the loader.
trainer:
  # Written without digit separators: `1_000_000` is an integer only under
  # YAML 1.1 resolvers and a plain string under the YAML 1.2 core schema.
  iterations: 1000000

  save_tag: step
  save_on_oom: true
  save_on_quit: true
  save_frequency: 500
  export_on_save: true

  keep_last_checkpoints: 4

  aggressive_optimizations: false
  load_disabled_engines: false

  load_state_dict: true

  # YAML has no `None` literal, so the unquoted original was already the
  # string "None"; it is quoted here to make that explicit.
  # NOTE(review): if a real null is intended, use `null` — confirm with the
  # consumer.
  gc_mode: "None"

  weight_dtype: float32
  amp: false

  backend: local
  deepspeed:
    zero_optimization_level: 0
    use_compression_training: true
|
|
# Inference settings.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
inference:
  weight_dtype: float32
  amp: false

  use_vocos: true
  normalize: false

  recurrent_chunk_size: 0
  recurrent_forward: false
|
|
# bitsandbytes settings; `enabled` is false in this file.
# NOTE(review): this section arrived flattened and wrapped in table pipes; the
# nesting below is reconstructed from the keys — confirm against the loader.
bitsandbytes:
  enabled: false
  injects: true
  linear: true
  embedding: true
|
|
# Device selection; table-pipe residue removed so this parses as a plain
# top-level key.
device: cpu