{
  "model_type": "voxtral_realtime",
  "decoder": {
    "dim": 3072,
    "n_layers": 26,
    "head_dim": 128,
    "hidden_dim": 9216,
    "n_heads": 32,
    "n_kv_heads": 8,
    "vocab_size": 131072,
    "norm_eps": 1e-05,
    "rope_theta": 1000000.0,
    "sliding_window": 8192,
    "tied_embeddings": true,
    "ada_rms_norm_t_cond": true,
    "ada_rms_norm_t_cond_dim": 32
  },
  "encoder_args": {
    "audio_encoding_args": {
      "sampling_rate": 16000,
      "frame_rate": 12.5,
      "num_mel_bins": 128,
      "hop_length": 160,
      "window_size": 400,
      "chunk_length_s": null,
      "global_log_mel_max": 1.5,
      "transcription_format": "streaming"
    },
    "dim": 1280,
    "n_layers": 32,
    "head_dim": 64,
    "hidden_dim": 5120,
    "n_heads": 32,
    "vocab_size": 131072,
    "n_kv_heads": 32,
    "use_biases": true,
    "use_cache": false,
    "rope_theta": 1000000.0,
    "causal": true,
    "norm_eps": 1e-05,
    "pos_embed": "rope",
    "max_source_positions": null,
    "ffn_type": "swiglu",
    "norm_type": "rms_norm",
    "sliding_window": 750,
    "downsample_factor": 4
  },
  "quantization_config": {
    "bits": 8,
    "group_size": 64
  }
}