File size: 3,482 Bytes
{
"dim": 3072,
"n_layers": 26,
"head_dim": 128,
"hidden_dim": 9216,
"n_heads": 32,
"n_kv_heads": 8,
"fp8_matmul": false,
"use_biases": false,
"causal": true,
"rope_theta": 1000000.0,
"norm_eps": 1e-05,
"init": "NO_INIT",
"dropout": 0.0,
"vocab_size": 131072,
"model_parallel": 1,
"is_sequence_parallel": false,
"context_parallel": 1,
"tied_embeddings": true,
"shard_on_vocab_dim": false,
"model_pipelining": 1,
"virtual_model_pipelining": 1,
"fused_rms_norm": true,
"checkpoint": false,
"use_cache": false,
"max_concurrent_tokens": 65536,
"learnable_sinks": false,
"rms_norm": "PRE",
"cust_bwd": false,
"recompute_w1_every": 0,
"recompute_w3_every": 0,
"recompute_attn_every": 0,
"freeze_nonembedding": false,
"fsdp2": true,
"dp_replicate_size": 1,
"zero2": true,
"fsdp_optimize_backward_concat_if_pp": true,
"attention_type": "FLASH_ATTN_3",
"multimodal": {
"bos_token_id": 1,
"audio_model_args": {
"semantic_codebook_size": 8192,
"acoustic_codebook_size": 21,
"n_acoustic_codebook": 36,
"audio_encoding_args": {
"codebook_pattern": "parallel",
"interleave_audio_tokens_per_segment": 8192,
"interleave_text_tokens_per_segment": 8192,
"single_trailing_segment": false,
"num_codebooks": 37,
"sampling_rate": 24000,
"frame_rate": 12.5
},
"audio_token_id": 24,
"begin_audio_token_id": 25,
"input_embedding_concat_type": "sum",
"acoustic_transformer_args": {
"input_dim": 3072,
"dim": 3072,
"n_layers": 3,
"head_dim": 128,
"hidden_dim": 9216,
"n_heads": 32,
"n_kv_heads": 8,
"use_biases": false,
"rope_theta": 10000.0,
"sigma": 1e-05,
"sigma_max": 1.0
},
"p_uncond": 0.0,
"text_feature_bugged": false,
"condition_dropped_token_id": 42
},
"audio_tokenizer_args": {
"channels": 1,
"sampling_rate": 24000,
"pretransform_patch_size": 240,
"patch_proj_kernel_size": 7,
"semantic_codebook_size": 8192,
"semantic_dim": 256,
"acoustic_codebook_size": 21,
"acoustic_dim": 36,
"conv_weight_norm": true,
"causal": true,
"attn_sliding_window_size": 16,
"half_attn_window_upon_downsampling": true,
"dim": 1024,
"hidden_dim": 4096,
"head_dim": 128,
"n_heads": 8,
"n_kv_heads": 8,
"qk_norm_eps": 1e-06,
"qk_norm": true,
"use_biases": false,
"norm_eps": 0.01,
"layer_scale": true,
"layer_scale_init": 0.01,
"decoder_transformer_lengths_str": "2,2,2,2",
"decoder_convs_kernels_str": "3,4,4,4",
"decoder_convs_strides_str": "1,2,2,2",
"voice": {
"casual_female": 0,
"casual_male": 1,
"cheerful_female": 2,
"neutral_female": 3,
"neutral_male": 4,
"pt_male": 5,
"pt_female": 6,
"nl_male": 7,
"nl_female": 8,
"it_male": 9,
"it_female": 10,
"fr_male": 11,
"fr_female": 12,
"es_male": 13,
"es_female": 14,
"de_male": 15,
"de_female": 16,
"ar_male": 17,
"hi_male": 18,
"hi_female": 19
}
}
},
"torch_compile_swiglu_noncust_bwd": false,
"override_parameters_str": "",
"max_seq_len": 65536,
"model_type": "voxtral_tts",
"max_position_embeddings": 128000
}