{
  "transformers_version": "5.5.0",
  "architectures": [
    "Gemma4AudioModel"
  ],
  "output_hidden_states": false,
  "return_dict": true,
  "dtype": "bfloat16",
  "chunk_size_feed_forward": 0,
  "is_encoder_decoder": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "problem_type": null,
  "hidden_size": 1024,
  "num_hidden_layers": 12,
  "num_attention_heads": 8,
  "hidden_act": "silu",
  "subsampling_conv_channels": [
    128,
    32
  ],
  "conv_kernel_size": 5,
  "residual_weight": 0.5,
  "attention_chunk_size": 12,
  "attention_context_left": 13,
  "attention_context_right": 0,
  "attention_logit_cap": 50.0,
  "attention_invalid_logits_value": -1000000000.0,
  "use_clipped_linears": true,
  "rms_norm_eps": 1e-06,
  "gradient_clipping": 10000000000.0,
  "output_proj_dims": 1536,
  "initializer_range": 0.02,
  "_name_or_path": "",
  "model_type": "gemma4_audio",
  "output_attentions": false,
  "torch_dtype": "bfloat16",
  "_verified_total_params": 304824608,
  "_verified_hidden_dim": 1024,
  "_verified_output_dim": 1536,
  "_verified_num_layers": 12,
  "_verified_num_heads": 8,
  "_verified_head_dim": 128,
  "_verified_ffn_intermediate": 4096,
  "_verified_conv_kernel": 5,
  "_verified_subsampling_channels": [
    128,
    32
  ],
  "_verified_subsampling_norm": "LayerNorm",
  "_verified_subsampling_activation": "ReLU",
  "_verified_conformer_activation": "SiLU",
  "_verified_conformer_norm": "RMSNorm",
  "_verified_conformer_norm_eps": 1e-06,
  "_verified_temporal_downsample": 4,
  "_source_model": "google/gemma-4-E2B-it",
  "_extraction_note": "Audio tower weights are identical between E2B and E4B variants"
}