{
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperCTCforTransformers"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50256
  ],
  "blank_token_id": 42,
  "bos_token_id": 50256,
  "classifier_proj_size": 256,
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "d_model": 384,
  "decoder_attention_heads": 6,
  "decoder_end_token_id": 41,
  "decoder_ffn_dim": 1536,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 1,
  "decoder_size": 43,
  "decoder_start_token_id": 40,
  "dropout": 0.0,
  "encoder_attention_heads": 6,
  "encoder_ffn_dim": 1536,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 4,
  "eos_token_id": 50256,
  "feature_size": 80,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "lexicon_path": "./lexicon.txt",
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "max_source_positions": 1500,
  "max_target_positions": 448,
  "median_filter_width": 7,
  "model_type": "whisper",
  "num_hidden_layers": 4,
  "num_mel_bins": 80,
  "pad_token_id": 42,
  "scale_embedding": false,
  "size": "tiny",
  "tokens": [
    "SIL",
    "AA",
    "AE",
    "AH",
    "AO",
    "AW",
    "AX",
    "AY",
    "B",
    "CH",
    "D",
    "DH",
    "EH",
    "ER",
    "EY",
    "F",
    "G",
    "HH",
    "IH",
    "IY",
    "JH",
    "K",
    "L",
    "M",
    "N",
    "NG",
    "OW",
    "OY",
    "P",
    "R",
    "S",
    "SH",
    "T",
    "TH",
    "UH",
    "UW",
    "V",
    "W",
    "Y",
    "Z",
    "ZH",
    "",
    ""
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "use_cache": true,
  "use_weighted_layer_sum": false,
  "vocab_size": 51864
}