{
  "architectures": [
    "Mapperatorinator"
  ],
  "backbone_config": {
    "_name_or_path": "openai/whisper-small",
    "architectures": [
      "WhisperForConditionalGeneration"
    ],
    "begin_suppress_tokens": null,
    "bos_token_id": 1,
    "d_model": 768,
    "decoder_attention_heads": 12,
    "decoder_ffn_dim": 3072,
    "decoder_layers": 12,
    "decoder_start_token_id": 1,
    "encoder_attention_heads": 12,
    "encoder_ffn_dim": 3072,
    "encoder_layers": 12,
    "eos_token_id": 2,
    "forced_decoder_ids": null,
    "max_length": 448,
    "max_source_positions": 2048,
    "max_target_positions": 5120,
    "model_type": "whisper",
    "num_hidden_layers": 12,
    "num_mel_bins": 464,
    "pad_token_id": 0,
    "tie_word_embeddings": false,
    "torch_dtype": "float32",
    "use_cache": false,
    "vocab_size": 4607
  },
  "backbone_model_name": "openai/whisper-small",
  "bos_token_id": 1,
  "cond_dim": 128,
  "cond_embeds": 3,
  "decoder_start_token_id": 1,
  "disable_compile": true,
  "do_sample": true,
  "do_style_embed": false,
  "embed_decoder_input": true,
  "eos_token_id": 2,
  "hidden_size": 768,
  "hop_length": 128,
  "init_std": 0.02,
  "input_features": true,
  "is_encoder_decoder": true,
  "max_length": null,
  "max_source_positions": 2048,
  "max_target_positions": 5120,
  "model_type": "mapperatorinator",
  "n_fft": 1024,
  "n_mels": 80,
  "num_attention_heads": 12,
  "num_classes": 80742,
  "num_hidden_layers": 12,
  "num_mappers": 3731,
  "pad_token_id": 0,
  "rhythm_token_end": 3280,
  "rhythm_token_start": 3,
  "rhythm_weight": 1.0,
  "sample_rate": 16000,
  "top_k": 0,
  "torch_dtype": "float32",
  "transformers_version": "4.47.0",
  "vocab_size": 4607,
  "vocab_size_in": 8339
}