{ "preprocessor": { "sample_rate": 16000, "normalize": "per_feature", "window_size": 0.025, "window_stride": 0.01, "window": "hann", "features": 128, "n_fft": 512, "dither": 1e-05, "pad_to": 0, "pad_value": 0.0, "preemph": 0.97, "mag_power": 2.0 }, "encoder": { "feat_in": 128, "n_layers": 32, "d_model": 1024, "n_heads": 8, "ff_expansion_factor": 4, "subsampling_factor": 8, "self_attention_model": "rel_pos", "subsampling": "dw_striding", "conv_kernel_size": 9, "subsampling_conv_channels": 256, "pos_emb_max_len": 5000, "causal_downsampling": false, "use_bias": true, "xscaling": false, "subsampling_conv_chunking_factor": 1, "att_context_size": [ -1, -1 ] }, "transf_decoder": { "vocab_size": 16384, "hidden_size": 1024, "inner_size": 4096, "num_layers": 8, "num_attention_heads": 8, "pre_ln": true, "hidden_act": "relu", "pre_ln_final_layer_norm": true, "learn_positional_encodings": false, "max_sequence_length": 1024 }, "head": { "num_layers": 1, "hidden_size": 1024, "num_classes": 16384 }, "prompt_format": "canary2", "tokenizer": { "type": "sentencepiece", "model_path": "tokenizer.model" } }