{
  "seed": 42,
  "data": {
    "data_path": "/root/Audio-Data/",
    "mdct_config": "shore_tts/configs/mdct.json",
    "num_workers": 16,
    "min_length": 480,
    "max_length": 9600,
    "batch_size": 64,
    "epoch_shuffle": true
  },
  "text": {
    "tokenizer_path": "checkpoints/vocab.json",
    "polyphone": true
  },
  "model": {
    "dit": {
      "dim": 768,
      "depth": 22,
      "heads": 12,
      "dim_head": 64,
      "dropout": 0.1,
      "ff_mult": 2,
      "text_dim": 512,
      "text_mask_padding": true,
      "text_embedding_average_upsampling": false,
      "qk_norm": null,
      "conv_layers": 4,
      "pe_attn_head": 1,
      "attn_backend": "flash_attn",
      "attn_mask_enabled": true,
      "long_skip_connection": false,
      "checkpoint_activations": true
    },
    "cfm": {
      "sigma": 0.0,
      "audio_drop_prob": 0.3,
      "cond_drop_prob": 0.2,
      "frac_lengths_mask": [
        0.7,
        1.0
      ]
    }
  },
  "optim": {
    "optimizer_type": "muon_adamw",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "grad_clip": 1.0,
    "muon_args": {
      "momentum": 0.95,
      "nesterov": true,
      "ns_steps": 5
    },
    "adamw_args": {
      "betas": [
        0.9,
        0.95
      ]
    }
  },
  "scheduler": {
    "warmup_steps": 20000,
    "warmup_start_factor": 1e-08,
    "final_lr_scale": 1e-08
  },
  "train": {
    "epochs": 1000,
    "max_steps": 1000000,
    "grad_accumulation_steps": 1,
    "log_every_steps": 10,
    "timing_every_steps": 100,
    "save_every_steps": 20000,
    "last_per_updates": 1000,
    "keep_last_n_checkpoints": 10,
    "ema_decay": 0.9999,
    "precision": "bf16",
    "allow_tf32": false,
    "log_samples": {
      "enabled": true,
      "sample_index": 0,
      "sample_steps": 16,
      "cfg_strength": 1.0,
      "duration_factor": 2.0
    },
    "save_dir": "checkpoints/pretrain-200M",
    "resume_from": "checkpoints/pretrain-200M/model_last.pt",
    "tensorboard": {
      "enabled": true,
      "log_dir": "checkpoints/pretrain-200M/tensorboard"
    }
  }
}