{ "model": { "fm_decoder_downsampling_factor": [1, 2, 4, 2, 1], "fm_decoder_num_layers": [2, 2, 4, 4, 4], "fm_decoder_cnn_module_kernel": [31, 15, 7, 15, 31], "fm_decoder_feedforward_dim": 1536, "fm_decoder_num_heads": 4, "fm_decoder_dim": 512, "text_encoder_num_layers": 4, "text_encoder_feedforward_dim": 512, "text_encoder_cnn_module_kernel": 9, "text_encoder_num_heads": 4, "text_encoder_dim": 192, "query_head_dim": 32, "value_head_dim": 12, "pos_head_dim": 4, "pos_dim": 48, "time_embed_dim": 192, "text_embed_dim": 192, "feat_dim": 100 }, "feature": { "sampling_rate": 24000, "type": "vocos" } }