{
  "dim": 3072,
  "n_layers": 26,
  "head_dim": 128,
  "hidden_dim": 9216,
  "n_heads": 32,
  "n_kv_heads": 8,
  "fp8_matmul": false,
  "use_biases": false,
  "causal": true,
  "rope_theta": 1000000.0,
  "norm_eps": 1e-05,
  "init": "NO_INIT",
  "dropout": 0.0,
  "vocab_size": 131072,
  "model_parallel": 1,
  "is_sequence_parallel": false,
  "context_parallel": 1,
  "tied_embeddings": true,
  "shard_on_vocab_dim": false,
  "model_pipelining": 1,
  "virtual_model_pipelining": 1,
  "fused_rms_norm": true,
  "checkpoint": false,
  "use_cache": false,
  "max_concurrent_tokens": 65536,
  "learnable_sinks": false,
  "rms_norm": "PRE",
  "cust_bwd": false,
  "recompute_w1_every": 0,
  "recompute_w3_every": 0,
  "recompute_attn_every": 0,
  "freeze_nonembedding": false,
  "fsdp2": true,
  "dp_replicate_size": 1,
  "zero2": true,
  "fsdp_optimize_backward_concat_if_pp": true,
  "attention_type": "FLASH_ATTN_3",
  "multimodal": {
    "bos_token_id": 1,
    "audio_model_args": {
      "semantic_codebook_size": 8192,
      "acoustic_codebook_size": 21,
      "n_acoustic_codebook": 36,
      "audio_encoding_args": {
        "codebook_pattern": "parallel",
        "interleave_audio_tokens_per_segment": 8192,
        "interleave_text_tokens_per_segment": 8192,
        "single_trailing_segment": false,
        "num_codebooks": 37,
        "sampling_rate": 24000,
        "frame_rate": 12.5
      },
      "audio_token_id": 24,
      "begin_audio_token_id": 25,
      "input_embedding_concat_type": "sum",
      "acoustic_transformer_args": {
        "input_dim": 3072,
        "dim": 3072,
        "n_layers": 3,
        "head_dim": 128,
        "hidden_dim": 9216,
        "n_heads": 32,
        "n_kv_heads": 8,
        "use_biases": false,
        "rope_theta": 10000.0,
        "sigma": 1e-05,
        "sigma_max": 1.0
      },
      "p_uncond": 0.0,
      "text_feature_bugged": false,
      "condition_dropped_token_id": 42
    },
    "audio_tokenizer_args": {
      "channels": 1,
      "sampling_rate": 24000,
      "pretransform_patch_size": 240,
      "patch_proj_kernel_size": 7,
      "semantic_codebook_size": 8192,
      "semantic_dim": 256,
      "acoustic_codebook_size": 21,
      "acoustic_dim": 36,
      "conv_weight_norm": true,
      "causal": true,
      "attn_sliding_window_size": 16,
      "half_attn_window_upon_downsampling": true,
      "dim": 1024,
      "hidden_dim": 4096,
      "head_dim": 128,
      "n_heads": 8,
      "n_kv_heads": 8,
      "qk_norm_eps": 1e-06,
      "qk_norm": true,
      "use_biases": false,
      "norm_eps": 0.01,
      "layer_scale": true,
      "layer_scale_init": 0.01,
      "decoder_transformer_lengths_str": "2,2,2,2",
      "decoder_convs_kernels_str": "3,4,4,4",
      "decoder_convs_strides_str": "1,2,2,2",
      "voice": {
        "casual_female": 0,
        "casual_male": 1,
        "cheerful_female": 2,
        "neutral_female": 3,
        "neutral_male": 4,
        "pt_male": 5,
        "pt_female": 6,
        "nl_male": 7,
        "nl_female": 8,
        "it_male": 9,
        "it_female": 10,
        "fr_male": 11,
        "fr_female": 12,
        "es_male": 13,
        "es_female": 14,
        "de_male": 15,
        "de_female": 16,
        "ar_male": 17,
        "hi_male": 18,
        "hi_female": 19
      }
    }
  },
  "torch_compile_swiglu_noncust_bwd": false,
  "override_parameters_str": "",
  "max_seq_len": 65536,
  "model_type": "voxtral_tts",
  "max_position_embeddings": 128000
}