{
  "model": {
    "fm_decoder_downsampling_factor": [
      1,
      2,
      4,
      2,
      1
    ],
    "fm_decoder_num_layers": [
      2,
      2,
      4,
      4,
      4
    ],
    "fm_decoder_cnn_module_kernel": [
      31,
      15,
      7,
      15,
      31
    ],
    "fm_decoder_feedforward_dim": 1536,
    "fm_decoder_num_heads": 4,
    "fm_decoder_dim": 512,
    "text_encoder_num_layers": 4,
    "text_encoder_feedforward_dim": 512,
    "text_encoder_cnn_module_kernel": 9,
    "text_encoder_num_heads": 4,
    "text_encoder_dim": 192,
    "query_head_dim": 32,
    "value_head_dim": 12,
    "pos_head_dim": 4,
    "pos_dim": 48,
    "time_embed_dim": 192,
    "text_embed_dim": 192,
    "feat_dim": 100
  },
  "feature": {
    "type": "vocos",
    "n_fft": 1024,
    "hop_length": 256,
    "n_mels": 100,
    "sample_rate": 24000
  },
  "architectures": [
    "LuxTTSForTextToSpeech"
  ]
}