{ "architectures": [ "Qwen3AudioWrappedForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "audio_adapter_configs": [ { "adapter_embedding_dim": 1024, "adapter_name": "downsampler_conformer", "audio_encoder_layers": [ 7, 15, 23, 31 ], "downsampler_depth": 1, "encoder_embedding_dim": 1280, "encoder_name": "whisper", "layer_fusion_config": { "layer_fusion_type": "weighted_average" }, "llm_embedding_dim": 2560, "norm_type": "batch", "pre_average": true, "use_conformer": false }, { "adapter_embedding_dim": 1024, "adapter_name": "identity", "audio_encoder_layers": [ 3, 7, 11 ], "encoder_name": "sslam", "layer_fusion_config": { "layer_fusion_type": "weighted_average" }, "llm_embedding_dim": 2560, "pre_average": true, "use_llm_proj": false }, { "adapter_embedding_dim": 1024, "adapter_name": "identity", "audio_encoder_layers": [ 4, 8, 12 ], "encoder_name": "muq", "layer_fusion_config": { "layer_fusion_type": "weighted_average" }, "llm_embedding_dim": 2560, "pre_average": true, "use_llm_proj": false }, { "adapter_embedding_dim": 1024, "adapter_name": "identity", "audio_encoder_layers": [ 6, 10, 15, 20, 24 ], "encoder_name": "w2vbert", "layer_fusion_config": { "layer_fusion_type": "weighted_average" }, "llm_embedding_dim": 2560, "pre_average": true, "use_llm_proj": false } ], "audio_encoder_configs": [ { "encoder_name": "whisper" }, { "encoder_name": "sslam" }, { "encoder_name": "muq" }, { "encoder_name": "w2vbert" } ], "audio_fusion_config": { "conditional_embedding_dim": [ 1024, 1024, 768 ], "conditional_encoders": [ "w2vbert", "muq", "sslam" ], "first_cross_attention_layer_shared": [ true, true, true ], "first_self_attention_block_shared": [ true, true, true ], "fusion_type": "multiperceiver", "llm_embedding_dim": 2560, "main_encoder": "whisper", "num_cross_attention_heads": [ 4, 4, 4 ], "num_cross_attention_layers": [ 1, 1, 1 ], "num_latent_channels": [ 768, 768, 768 ], "num_latents": [ 20, 20, 20 ], "num_self_attention_blocks": [ 3, 3, 3 ], "num_self_attention_heads": [ 4, 4, 4 ], "num_self_attention_layers_per_block": [ 6, 6, 6 ] }, "audio_postprocessing_config": { "postprocessing_type": "identity" }, "audio_sep_d_embed": 2560, "bos_token_id": 151643, "dtype": "float32", "eos_token_id": 151645, "head_dim": 128, "hidden_act": "silu", "hidden_size": 2560, "initializer_range": 0.02, "intermediate_size": 9728, "layer_types": [ "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention", "full_attention" ], "max_position_embeddings": 262144, "max_window_layers": 36, "model_type": "qwen3_audio", "num_attention_heads": 32, "num_hidden_layers": 36, "num_key_value_heads": 8, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 5000000, "sliding_window": null, "tie_word_embeddings": true, "transformers_version": "4.57.1", "use_cache": false, "use_explicit_audio_tokens": false, "use_sliding_window": false, "vocab_size": 151936 }