{ "architectures": [ "Mapperatorinator" ], "backbone_config": { "_name_or_path": "openai/whisper-small", "activation_dropout": 0.0, "activation_function": "gelu", "apply_spec_augment": false, "architectures": [ "WhisperForConditionalGeneration" ], "attention_dropout": 0.0, "begin_suppress_tokens": null, "bos_token_id": 1, "classifier_proj_size": 256, "d_model": 768, "decoder_attention_heads": 12, "decoder_ffn_dim": 3072, "decoder_layerdrop": 0.0, "decoder_layers": 12, "decoder_start_token_id": 1, "dropout": 0.0, "encoder_attention_heads": 12, "encoder_ffn_dim": 3072, "encoder_layerdrop": 0.0, "encoder_layers": 12, "eos_token_id": 2, "forced_decoder_ids": null, "init_std": 0.02, "mask_feature_length": 10, "mask_feature_min_masks": 0, "mask_feature_prob": 0.0, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_time_prob": 0.05, "max_length": 448, "max_source_positions": 2048, "max_target_positions": 8192, "median_filter_width": 7, "model_type": "whisper", "num_hidden_layers": 12, "num_mel_bins": 464, "pad_token_id": 0, "rope_decoder_scaling_factor": 1.0, "rope_encoder_scaling_factor": 1.0, "rope_type": "dynamic", "scale_embedding": false, "tie_word_embeddings": false, "torch_dtype": "float32", "use_cache": false, "use_weighted_layer_sum": false, "vocab_size": 5971 }, "backbone_model_name": "Tiger14n/ropewhisper-small", "bos_token_id": 1, "cond_dim": 128, "cond_size": 384, "decoder_start_token_id": 1, "disable_compile": true, "do_difficulty_embed": true, "do_mapper_embed": true, "do_sample": true, "do_song_position_embed": true, "do_style_embed": false, "embed_decoder_input": true, "eos_token_id": 2, "f_max": 8000, "f_min": 20, "hidden_size": 768, "hop_length": 128, "init_std": 0.02, "input_features": true, "is_encoder_decoder": true, "max_length": null, "max_source_positions": 2048, "max_target_positions": 8192, "model_type": "mapperatorinator", "n_fft": 1024, "n_mels": 80, "num_attention_heads": 12, "num_classes": 0, "num_hidden_layers": 12, "num_mappers": 5527, "pad_mode": "reflect", "pad_token_id": 0, "project_encoder_input": false, "rhythm_token_end": 3294, "rhythm_token_start": 17, "rhythm_weight": 1.0, "sample_rate": 16000, "spectrogram_implementation": "torchaudio", "spectrogram_log_scale": true, "top_k": 0, "torch_dtype": "float32", "transformers_version": "4.52.1", "vocab_size": 5971, "vocab_size_in": 11714 }