# Top-level audio settings.
# NOTE(review): `24_000` relies on YAML 1.1 integer resolution (e.g. PyYAML),
# where underscores are ignored and the value loads as the integer 24000. A
# strict YAML 1.2 core-schema parser would load it as a string instead;
# confirm which loader consumes this file.
sample_rate: 24_000
audio_backend: vocos

# Model definitions: a sequence holding one mapping per model.
# NOTE(review): the nesting was reconstructed from a flattened copy of this
# file. Every key below is assumed to belong to the single "ar+nar" entry;
# confirm against the loader's schema.
models:
  - name: "ar+nar"
    size: "full"
    resp_levels: 8
    prom_levels: 8
    tasks: 8
    langs: 2
    tones: 1
    arch_type: retnet
    training: False
    version: 2
    dropout: 0.1
    capabilities: ["ar", "nar"]
    # `experimental` carries no inline value, so the key that follows it is
    # assumed to be its child rather than a sibling. TODO confirm.
    experimental:
      audio_embedding_sums: True

# Optimiser, scheduler and batching hyperparameters.
hyperparameters:
  autotune: False
  # `autotune_params` carries no inline value; the three profiling keys that
  # follow it are assumed to be its children. TODO confirm.
  autotune_params:
    start_profile_step: 1
    end_profile_step: 50
    num_tuning_micro_batch_sizes: 8

  batch_size: 16
  gradient_accumulation_steps: 8
  gradient_clipping: 1.0
  warmup_steps: 250

  optimizer: Prodigy
  learning_rate: 1.0
  torch_optimizer: True

  # Explicit empty string: this loads as "" rather than null.
  scheduler: ""
  torch_scheduler: True

# Evaluation settings: batching/frequency first, then sampling parameters.
evaluation:
  batch_size: 16
  frequency: 1000
  size: 16

  steps: 500
  ar_temperature: 0.95
  nar_temperature: 0.25
  load_disabled_engines: True

# Training-loop settings: checkpointing, precision and backend selection.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file; confirm it against the loader's schema.
trainer:
  ddp: False
  check_for_oom: False
  # The underscore form loads as an integer only under YAML 1.1 resolution
  # (e.g. PyYAML); a YAML 1.2 core-schema parser would yield a string.
  iterations: 1_000_000

  save_tag: step
  save_on_oom: True
  save_on_quit: True
  save_frequency: 500
  export_on_save: True

  keep_last_checkpoints: 8

  aggressive_optimizations: False
  load_disabled_engines: False
  gradient_checkpointing: True

  strict_loading: False

  # NOTE(review): YAML has no `None` literal, so this loads as the string
  # "None", not as null. Write `null` if a null value is intended. Left
  # unchanged here because the consuming code is not visible.
  gc_mode: None

  weight_dtype: bfloat16
  amp: True

  backend: deepspeed
  deepspeed:
    inferencing: True
    zero_optimization_level: 0
    use_compression_training: False

    # Placed under `deepspeed` because `trainer` already defines `amp` above;
    # keeping both at the same level would be a duplicate key. TODO confirm.
    amp: False

  # Assumed to be a `trainer`-level key, not a `deepspeed` one. TODO confirm.
  load_webui: False

# Inference-time settings: backend, audio backend and precision.
inference:
  backend: deepspeed
  audio_backend: "vocos"
  normalize: False

  weight_dtype: bfloat16
  amp: True

# Boolean switches for optional optimisation paths; all values are plain
# booleans.
optimizations:
  injects: False
  replace: True

  linear: False
  embedding: False
  optimizers: True

  bitsandbytes: False
  dadaptation: False
  bitnet: False
  fp8: False

# Dataset settings: speaker naming, storage, loader options, duration limits
# and the data lists.
dataset:
  # These values must stay quoted: they contain `: `, which would otherwise
  # be parsed as a mapping. They look like Python lambda source, presumably
  # evaluated by the consumer. If so, treat this file as trusted input only;
  # verify against the loader.
  speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
  # `ja` is assumed to be the only child of `speaker_languages`. TODO confirm.
  speaker_languages:
    ja: []

  use_hdf5: True
  use_metadata: True
  hdf5_flag: r
  validate: True

  workers: 6
  cache: True

  duration_range: [3.0, 16.0]

  random_utterance: 1.0
  max_prompts: 1
  prompt_duration_range: [3.0, 9.0]

  max_resps: 1
  p_resp_append: 0.25

  sample_type: path

  tasks_list: ["tts"]

  # Explicit empty lists: a bare `key:` would load as null instead.
  training: []
  validation: []
  noise: []