{ "model" : { "fm-decoder-downsampling-factor" : "1,2,4,2,1", "fm-decoder-num-layers" : "2,2,4,4,4", "fm-decoder-cnn-module-kernel" : "31,15,7,15,31", "fm-decoder-feedforward-dim" : 1536, "fm-decoder-num-heads" : 4, "fm-decoder-dim" : 512, "text-encoder-downsampling-factor" : "1", "text-encoder-num-layers" : "4", "text-encoder-feedforward-dim" : 512, "text-encoder-cnn-module-kernel" : 9, "text-encoder-num-heads" : 4, "text-encoder-dim" : 192, "query-head-dim" : 32, "value-head-dim" : 32, "pos-head-dim" : 4, "pos-dim" : 48, "time-embed-dim" : 192, "text-embed-dim" : 192 }, "feature" : { "sampling_rate": 24000, "frame_shift_ms": 10.666666666666666, "feat_dim": 100, "n_fft" : 1024, "hop_length" : 256 } }