{
    "ada_rms_norm_t_cond": true,
    "ada_rms_norm_t_cond_dim": 32,
    "causal": true,
    "dim": 3072,
    "head_dim": 128,
    "hidden_dim": 9216,
    "model_max_length": 131072,
    "model_parallel": 1,
    "multimodal": {
        "whisper_model_args": {
            "encoder_args": {
                "audio_encoding_args": {
                    "sampling_rate": 16000,
                    "frame_rate": 12.5,
                    "num_mel_bins": 128,
                    "hop_length": 160,
                    "window_size": 400,
                    "chunk_length_s": null,
                    "global_log_mel_max": 1.5,
                    "transcription_format": "streaming"
                },
                "dim": 1280,
                "n_layers": 32,
                "head_dim": 64,
                "hidden_dim": 5120,
                "n_heads": 32,
                "vocab_size": 131072,
                "n_kv_heads": 32,
                "use_biases": true,
                "use_cache": false,
                "rope_theta": 1000000.0,
                "causal": true,
                "norm_eps": 1e-05,
                "pos_embed": "rope",
                "max_source_positions": null,
                "ffn_type": "swiglu",
                "norm_type": "rms_norm",
                "sliding_window": 750
            },
            "downsample_args": {
                "downsample_factor": 4
            }
        }
    },
    "n_heads": 32,
    "n_kv_heads": 8,
    "n_layers": 26,
    "norm_eps": 1e-05,
    "quantization": {
        "group_size": 64,
        "bits": 6
    },
    "rope_theta": 1000000.0,
    "sliding_window": 8192,
    "tied_embeddings": true,
    "use_biases": false,
    "vocab_size": 131072
}