{
  "model_type": "dots_tts",
  "latent_dim": 128,
  "patch_size": 4,
  "cfg_droprate": 0.2,
  "LLM": {
    "model_name_or_path": "/prodcpfs/user/jiatu/logs/qwen25_15b"
  },
  "PatchEncoder": {
    "num_layers": 24,
    "num_heads": 16,
    "hidden_size": 1024,
    "ffn_hidden_size": 4096,
    "modulation": false,
    "qkv_bias": false,
    "qk_norm": true,
    "attn_dropout": 0.0,
    "dropout": 0.1,
    "norm_layer": "RMSNorm",
    "alibi_bias": false,
    "rotary_bias": true,
    "rotary_theta": 10000.0,
    "input_dim": 128,
    "causal": true
  },
  "DiT": {
    "num_layers": 18,
    "num_heads": 16,
    "hidden_size": 1024,
    "ffn_hidden_size": 4096,
    "modulation": true,
    "qkv_bias": false,
    "qk_norm": true,
    "attn_dropout": 0.0,
    "dropout": 0.0,
    "norm_layer": "RMSNorm",
    "alibi_bias": false,
    "rotary_bias": true,
    "rotary_theta": 10000.0
  },
  "vocoder": {
    "sample_rate": 48000,
    "upsample_rates": [10, 6, 4, 2, 2, 2],
    "upsample_kernel_sizes": [20, 12, 8, 4, 4, 4],
    "upsample_initial_channel": 1536,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "downsample_rates": [2, 2, 2, 4, 6, 10],
    "downsample_channels": [12, 24, 48, 96, 192, 384, 768],
    "activation": "snakebeta",
    "snake_logscale": true,
    "latent_dim": 128,
    "causal": true,
    "mi_num_layers": 4,
    "causal_encoder": true,
    "use_bias_at_final": false,
    "use_tanh_at_final": false
  },
  "fm_sigma": 0.0,
  "xvec_drop_rate": 0.2,
  "campplus_embedding_size": 512,
  "xvec_max_audio_seconds": 10.0,
  "meanflow": {
    "enabled": true,
    "use_duration_embedding": true
  },
  "architectures": [
    "DotsTTSForConditionalGeneration"
  ]
}