majentik's picture
Add MLX quantized model
66e1e87 verified
{
"dim": 3072,
"n_layers": 26,
"head_dim": 128,
"hidden_dim": 9216,
"n_heads": 32,
"n_kv_heads": 8,
"fp8_matmul": false,
"use_biases": false,
"causal": true,
"rope_theta": 1000000.0,
"norm_eps": 1e-05,
"init": "NO_INIT",
"dropout": 0.0,
"vocab_size": 131072,
"model_parallel": 1,
"is_sequence_parallel": false,
"context_parallel": 1,
"tied_embeddings": true,
"shard_on_vocab_dim": false,
"model_pipelining": 1,
"virtual_model_pipelining": 1,
"fused_rms_norm": true,
"checkpoint": false,
"use_cache": false,
"max_concurrent_tokens": 65536,
"learnable_sinks": false,
"rms_norm": "PRE",
"cust_bwd": false,
"recompute_w1_every": 0,
"recompute_w3_every": 0,
"recompute_attn_every": 0,
"freeze_nonembedding": false,
"fsdp2": true,
"dp_replicate_size": 1,
"zero2": true,
"fsdp_optimize_backward_concat_if_pp": true,
"attention_type": "FLASH_ATTN_3",
"multimodal": {
"bos_token_id": 1,
"audio_model_args": {
"semantic_codebook_size": 8192,
"acoustic_codebook_size": 21,
"n_acoustic_codebook": 36,
"audio_encoding_args": {
"codebook_pattern": "parallel",
"interleave_audio_tokens_per_segment": 8192,
"interleave_text_tokens_per_segment": 8192,
"single_trailing_segment": false,
"num_codebooks": 37,
"sampling_rate": 24000,
"frame_rate": 12.5
},
"audio_token_id": 24,
"begin_audio_token_id": 25,
"input_embedding_concat_type": "sum",
"acoustic_transformer_args": {
"input_dim": 3072,
"dim": 3072,
"n_layers": 3,
"head_dim": 128,
"hidden_dim": 9216,
"n_heads": 32,
"n_kv_heads": 8,
"use_biases": false,
"rope_theta": 10000.0,
"sigma": 1e-05,
"sigma_max": 1.0
},
"p_uncond": 0.0,
"text_feature_bugged": false,
"condition_dropped_token_id": 42
},
"audio_tokenizer_args": {
"channels": 1,
"sampling_rate": 24000,
"pretransform_patch_size": 240,
"patch_proj_kernel_size": 7,
"semantic_codebook_size": 8192,
"semantic_dim": 256,
"acoustic_codebook_size": 21,
"acoustic_dim": 36,
"conv_weight_norm": true,
"causal": true,
"attn_sliding_window_size": 16,
"half_attn_window_upon_downsampling": true,
"dim": 1024,
"hidden_dim": 4096,
"head_dim": 128,
"n_heads": 8,
"n_kv_heads": 8,
"qk_norm_eps": 1e-06,
"qk_norm": true,
"use_biases": false,
"norm_eps": 0.01,
"layer_scale": true,
"layer_scale_init": 0.01,
"decoder_transformer_lengths_str": "2,2,2,2",
"decoder_convs_kernels_str": "3,4,4,4",
"decoder_convs_strides_str": "1,2,2,2",
"voice": {
"casual_female": 0,
"casual_male": 1,
"cheerful_female": 2,
"neutral_female": 3,
"neutral_male": 4,
"pt_male": 5,
"pt_female": 6,
"nl_male": 7,
"nl_female": 8,
"it_male": 9,
"it_female": 10,
"fr_male": 11,
"fr_female": 12,
"es_male": 13,
"es_female": 14,
"de_male": 15,
"de_female": 16,
"ar_male": 17,
"hi_male": 18,
"hi_female": 19
}
}
},
"torch_compile_swiglu_noncust_bwd": false,
"override_parameters_str": "",
"max_seq_len": 65536,
"model_type": "voxtral_tts",
"max_position_embeddings": 128000
}