{ "dim": 3072, "n_layers": 26, "head_dim": 128, "hidden_dim": 9216, "n_heads": 32, "n_kv_heads": 8, "fp8_matmul": false, "use_biases": false, "causal": true, "rope_theta": 1000000.0, "norm_eps": 1e-05, "init": "NO_INIT", "dropout": 0.0, "vocab_size": 131072, "model_parallel": 1, "is_sequence_parallel": false, "context_parallel": 1, "tied_embeddings": true, "shard_on_vocab_dim": false, "model_pipelining": 1, "virtual_model_pipelining": 1, "fused_rms_norm": true, "checkpoint": false, "use_cache": false, "max_concurrent_tokens": 65536, "learnable_sinks": false, "rms_norm": "PRE", "cust_bwd": false, "recompute_w1_every": 0, "recompute_w3_every": 0, "recompute_attn_every": 0, "freeze_nonembedding": false, "fsdp2": true, "dp_replicate_size": 1, "zero2": true, "fsdp_optimize_backward_concat_if_pp": true, "attention_type": "FLASH_ATTN_3", "multimodal": { "bos_token_id": 1, "audio_model_args": { "semantic_codebook_size": 8192, "acoustic_codebook_size": 21, "n_acoustic_codebook": 36, "audio_encoding_args": { "codebook_pattern": "parallel", "interleave_audio_tokens_per_segment": 8192, "interleave_text_tokens_per_segment": 8192, "single_trailing_segment": false, "num_codebooks": 37, "sampling_rate": 24000, "frame_rate": 12.5 }, "audio_token_id": 24, "begin_audio_token_id": 25, "input_embedding_concat_type": "sum", "acoustic_transformer_args": { "input_dim": 3072, "dim": 3072, "n_layers": 3, "head_dim": 128, "hidden_dim": 9216, "n_heads": 32, "n_kv_heads": 8, "use_biases": false, "rope_theta": 10000.0, "sigma": 1e-05, "sigma_max": 1.0 }, "p_uncond": 0.0, "text_feature_bugged": false, "condition_dropped_token_id": 42 }, "audio_tokenizer_args": { "channels": 1, "sampling_rate": 24000, "pretransform_patch_size": 240, "patch_proj_kernel_size": 7, "semantic_codebook_size": 8192, "semantic_dim": 256, "acoustic_codebook_size": 21, "acoustic_dim": 36, "conv_weight_norm": true, "causal": true, "attn_sliding_window_size": 16, "half_attn_window_upon_downsampling": true, "dim": 1024, "hidden_dim": 4096, "head_dim": 128, "n_heads": 8, "n_kv_heads": 8, "qk_norm_eps": 1e-06, "qk_norm": true, "use_biases": false, "norm_eps": 0.01, "layer_scale": true, "layer_scale_init": 0.01, "decoder_transformer_lengths_str": "2,2,2,2", "decoder_convs_kernels_str": "3,4,4,4", "decoder_convs_strides_str": "1,2,2,2", "voice": { "casual_female": 0, "casual_male": 1, "cheerful_female": 2, "neutral_female": 3, "neutral_male": 4, "pt_male": 5, "pt_female": 6, "nl_male": 7, "nl_female": 8, "it_male": 9, "it_female": 10, "fr_male": 11, "fr_female": 12, "es_male": 13, "es_female": 14, "de_male": 15, "de_female": 16, "ar_male": 17, "hi_male": 18, "hi_female": 19 } } }, "torch_compile_swiglu_noncust_bwd": false, "override_parameters_str": "", "max_seq_len": 65536, "model_type": "voxtral_tts", "max_position_embeddings": 128000 }