{
  "architectures": [
    "ClapModel"
  ],
  "audio_config": {
    "_attn_implementation_autoset": true,
    "depths": [
      1,
      1
    ],
    "hidden_size": 192,
    "intermediate_size": 37,
    "model_type": "clap_audio_model",
    "num_attention_heads": [
      1,
      1
    ],
    "num_hidden_layers": 1,
    "num_mel_bins": 8,
    "patch_embed_input_channels": 4,
    "patch_size": 2,
    "patch_stride": 2,
    "projection_dim": 16,
    "spec_size": 8,
    "window_size": 4
  },
  "hidden_size": 8,
  "initializer_factor": 1.0,
  "logit_scale_init_value": 14.285714285714285,
  "model_type": "clap",
  "num_hidden_layers": 3,
  "projection_dim": 16,
  "projection_hidden_act": "relu",
  "text_config": {
    "_attn_implementation_autoset": true,
    "hidden_size": 8,
    "intermediate_size": 37,
    "layer_norm_eps": 1e-05,
    "model_type": "clap_text_model",
    "num_attention_heads": 1,
    "num_hidden_layers": 1,
    "projection_dim": 16,
    "vocab_size": 1000
  },
  "torch_dtype": "float32",
  "transformers_version": "4.50.0.dev0"
}