{ "architectures": [ "MDDTransformer" ], "batch_first": true, "d_model": 32, "dim_feedforward": 256, "hidden_dropout_prob": 0.1, "input_channels": 2, "max_source_positions": 3000, "model_type": "mdd_transformer", "num_attention_heads_decoder": 2, "num_attention_heads_encoder": 4, "num_classes": 43, "num_cross_attention_heads": 2, "num_decoder_layers": 2, "num_encoder_layers": 2, "num_mel_bins": 80, "projector_activation": "softmax", "tokens": [ "SIL", "AA", "AE", "AH", "AO", "AW", "AX", "AY", "B", "CH", "D", "DH", "EH", "ER", "EY", "F", "G", "HH", "IH", "IY", "JH", "K", "L", "M", "N", "NG", "OW", "OY", "P", "R", "S", "SH", "T", "TH", "UH", "UW", "V", "W", "Y", "Z", "ZH" ], "torch_dtype": "float32", "transformers_version": "4.52.1" }