{
  "seed": 42,
  "data": {
    "data_path": "/root/Audio-Data/",
    "mdct_config": "shore_tts/configs/mdct.json",
    "num_workers": 16,
    "min_length": 480,
    "max_length": 9600,
    "batch_size": 64,
    "epoch_shuffle": true
  },
  "text": {
    "tokenizer_path": "checkpoints/vocab.json",
    "polyphone": true
  },
  "model": {
    "dit": {
      "dim": 768,
      "depth": 22,
      "heads": 12,
      "dim_head": 64,
      "dropout": 0.1,
      "ff_mult": 2,
      "text_dim": 512,
      "text_mask_padding": true,
      "text_embedding_average_upsampling": false,
      "qk_norm": null,
      "conv_layers": 4,
      "pe_attn_head": 1,
      "attn_backend": "flash_attn",
      "attn_mask_enabled": true,
      "long_skip_connection": false,
      "checkpoint_activations": true
    },
    "cfm": {
      "sigma": 0.0,
      "audio_drop_prob": 0.3,
      "cond_drop_prob": 0.2,
      "frac_lengths_mask": [
        0.7,
        1.0
      ]
    }
  },
  "optim": {
    "optimizer_type": "muon_adamw",
    "lr": 0.0001,
    "weight_decay": 0.05,
    "grad_clip": 1.0,
    "muon_args": {
      "momentum": 0.95,
      "nesterov": true,
      "ns_steps": 5
    },
    "adamw_args": {
      "betas": [
        0.9,
        0.95
      ]
    }
  },
  "scheduler": {
    "warmup_steps": 20000,
    "warmup_start_factor": 1e-08,
    "final_lr_scale": 1e-08
  },
  "train": {
    "epochs": 1000,
    "max_steps": 1000000,
    "grad_accumulation_steps": 1,
    "log_every_steps": 10,
    "timing_every_steps": 100,
    "save_every_steps": 20000,
    "last_per_updates": 1000,
    "keep_last_n_checkpoints": 10,
    "ema_decay": 0.9999,
    "precision": "bf16",
    "allow_tf32": false,
    "log_samples": {
      "enabled": true,
      "sample_index": 0,
      "sample_steps": 16,
      "cfg_strength": 1.0,
      "duration_factor": 2.0
    },
    "save_dir": "checkpoints/pretrain-200M",
    "resume_from": "checkpoints/pretrain-200M/model_last.pt",
    "tensorboard": {
      "enabled": true,
      "log_dir": "checkpoints/pretrain-200M/tensorboard"
    }
  }
}