File size: 3,482 Bytes
144ed5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
{
  "dim": 3072,
  "n_layers": 26,
  "head_dim": 128,
  "hidden_dim": 9216,
  "n_heads": 32,
  "n_kv_heads": 8,
  "fp8_matmul": false,
  "use_biases": false,
  "causal": true,
  "rope_theta": 1000000.0,
  "norm_eps": 1e-05,
  "init": "NO_INIT",
  "dropout": 0.0,
  "vocab_size": 131072,
  "model_parallel": 1,
  "is_sequence_parallel": false,
  "context_parallel": 1,
  "tied_embeddings": true,
  "shard_on_vocab_dim": false,
  "model_pipelining": 1,
  "virtual_model_pipelining": 1,
  "fused_rms_norm": true,
  "checkpoint": false,
  "use_cache": false,
  "max_concurrent_tokens": 65536,
  "learnable_sinks": false,
  "rms_norm": "PRE",
  "cust_bwd": false,
  "recompute_w1_every": 0,
  "recompute_w3_every": 0,
  "recompute_attn_every": 0,
  "freeze_nonembedding": false,
  "fsdp2": true,
  "dp_replicate_size": 1,
  "zero2": true,
  "fsdp_optimize_backward_concat_if_pp": true,
  "attention_type": "FLASH_ATTN_3",
  "multimodal": {
    "bos_token_id": 1,
    "audio_model_args": {
      "semantic_codebook_size": 8192,
      "acoustic_codebook_size": 21,
      "n_acoustic_codebook": 36,
      "audio_encoding_args": {
        "codebook_pattern": "parallel",
        "interleave_audio_tokens_per_segment": 8192,
        "interleave_text_tokens_per_segment": 8192,
        "single_trailing_segment": false,
        "num_codebooks": 37,
        "sampling_rate": 24000,
        "frame_rate": 12.5
      },
      "audio_token_id": 24,
      "begin_audio_token_id": 25,
      "input_embedding_concat_type": "sum",
      "acoustic_transformer_args": {
        "input_dim": 3072,
        "dim": 3072,
        "n_layers": 3,
        "head_dim": 128,
        "hidden_dim": 9216,
        "n_heads": 32,
        "n_kv_heads": 8,
        "use_biases": false,
        "rope_theta": 10000.0,
        "sigma": 1e-05,
        "sigma_max": 1.0
      },
      "p_uncond": 0.0,
      "text_feature_bugged": false,
      "condition_dropped_token_id": 42
    },
    "audio_tokenizer_args": {
      "channels": 1,
      "sampling_rate": 24000,
      "pretransform_patch_size": 240,
      "patch_proj_kernel_size": 7,
      "semantic_codebook_size": 8192,
      "semantic_dim": 256,
      "acoustic_codebook_size": 21,
      "acoustic_dim": 36,
      "conv_weight_norm": true,
      "causal": true,
      "attn_sliding_window_size": 16,
      "half_attn_window_upon_downsampling": true,
      "dim": 1024,
      "hidden_dim": 4096,
      "head_dim": 128,
      "n_heads": 8,
      "n_kv_heads": 8,
      "qk_norm_eps": 1e-06,
      "qk_norm": true,
      "use_biases": false,
      "norm_eps": 0.01,
      "layer_scale": true,
      "layer_scale_init": 0.01,
      "decoder_transformer_lengths_str": "2,2,2,2",
      "decoder_convs_kernels_str": "3,4,4,4",
      "decoder_convs_strides_str": "1,2,2,2",
      "voice": {
        "casual_female": 0,
        "casual_male": 1,
        "cheerful_female": 2,
        "neutral_female": 3,
        "neutral_male": 4,
        "pt_male": 5,
        "pt_female": 6,
        "nl_male": 7,
        "nl_female": 8,
        "it_male": 9,
        "it_female": 10,
        "fr_male": 11,
        "fr_female": 12,
        "es_male": 13,
        "es_female": 14,
        "de_male": 15,
        "de_female": 16,
        "ar_male": 17,
        "hi_male": 18,
        "hi_female": 19
      }
    }
  },
  "torch_compile_swiglu_noncust_bwd": false,
  "override_parameters_str": "",
  "max_seq_len": 65536,
  "model_type": "voxtral_tts",
  "max_position_embeddings": 128000
}