{ "hidden_size": 768, "num_hidden_layers": 12, "num_attention_heads": 12, "intermediate_size": 3072, "hidden_dropout": 0.1, "attention_dropout": 0.1, "activation_dropout": 0.0, "layer_norm_eps": 1e-05, "feat_extract_norm": "group", "feat_extract_activation": "gelu", "feat_proj_dropout": 0.0, "use_rope": true, "rope_theta": 10000.0, "sample_rate": 16000, "teacher_model_name": "TuKoResearch/AuriStream100M_40Pred_BigAudioDataset_500k", "teacher_hidden_size": 768, "conv_dim": [ 512, 512, 512, 512, 512, 512, 512 ], "conv_stride": [ 5, 2, 2, 2, 2, 2, 2 ], "conv_kernel": [ 10, 3, 3, 3, 3, 2, 2 ], "conv_bias": false, "model_type": "distilled_speech", "auto_map": { "AutoConfig": "configuration_distilled_speech.DistilledSpeechConfig", "AutoModel": "modeling_distilled_speech.DistilledSpeechModel" }, "architectures": [ "DistilledSpeechModel" ], "training_step": 100000 }