{
  "model_type": "wav2vec2-conformer",
  "auto_map": {
    "AutoModel": "modeling_conformer.Wav2Vec2ConformerRNNT"
  },
  "vocab_size": 5632,
  "hidden_size": 512,
  "num_hidden_layers": 17,
  "num_attention_heads": 8,
  "intermediate_size": 2048,
  "hidden_act": "swish",
  "conv_depthwise_kernel_size": 31,
  "mask_time_prob": 0,
  "lstm_layer": 1,
  "pred_hidden": 640,
  "joint_hidden": 640,
  "sampling_rate": 16000,
  "max_symbols_per_step": 10,
  "apply_spec_augment": false,
  "feat_extract_activation": "relu",
  "feat_extract_norm": "layer",
  "conv_bias": true,
  "conv_stride": [2, 2],
  "conv_kernel": [3, 3],
  "conv_dim": [512, 512],
  "blank_id": 256,
  "languages": [
    "as",
    "bn",
    "brx",
    "doi",
    "gu",
    "hi",
    "kn",
    "kok",
    "ks",
    "mai",
    "ml",
    "mni",
    "mr",
    "ne",
    "or",
    "pa",
    "sa",
    "sat",
    "sd",
    "ta",
    "te",
    "ur"
  ]
}