florianvoss's picture
Upload compiled artifacts from LFM2.5-Audio-1.5B
5c0d3d3 verified
{
"model_name": "LFM2.5-Audio-1.5B",
"model_type": "aud-lfm2-s2s",
"vm_cfg": null,
"mm_cfg": null,
"lm_cfg": {
"model_type": "lfm2",
"data_type": "bfloat16",
"arch": "lfm",
"token_cfg": {
"vocab_size": 65536
},
"rope_cfg": {
"rope_theta": 1000000,
"rope_local_base_freq": 1000000,
"rope_scaling": {
"factor": 1.0,
"low_freq_factor": 0,
"high_freq_factor": 0,
"original_max_position_embeddings": 0,
"long_factor": null,
"short_factor": null,
"rope_type": "default",
"mrope_section": null,
"mrope_interleaved": false
}
},
"attn_cfg": {
"num_attention_heads": 32,
"num_key_value_heads": 8,
"head_dim": 64,
"swa_enable": false,
"sliding_window": 0,
"attention_bias": false,
"attention_dropout": 0.0,
"query_pre_attn_scalar": 0
},
"mlp_cfg": {
"intermediate_size": 8192,
"act": "silu",
"num_layers": 3,
"mlp_bias": false
},
"hidden_size": 2048,
"num_hidden_layers": 16,
"max_position_embeddings": 2048,
"rms_norm_eps": 1e-05,
"rms_norm_unit_offset": false,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
],
"attn_logit_softcapping": null,
"final_logit_softcapping": null,
"lm_head_num_splits": 1,
"lm_head_split_dim": 65536,
"conv_L_cache": 3,
"conv_bias": false,
"lora_cfg": null
},
"pipeline_cfg": {
"system_prompt": null,
"chat_template": null,
"max_num_tokens": 2048,
"input_token_group_size": 128,
"input_token_group_offsets": [
0,
128,
256,
384,
512,
640,
768,
896,
1024,
1152,
1280,
1408,
1536,
1664,
1792,
1920
],
"future_token_mask_size": 128,
"return_logits": false,
"use_strided_kv_cache": false,
"enable_filter_sharing": false,
"quantize_embeddings": false,
"split_mlp": true
},
"audio_pipeline_cfg": {
"codebooks": 8,
"tie_audio_embeddings": false,
"semantic_codebook_factor": 100,
"codebook_weight": "log",
"interleaved_n_text": 6,
"interleaved_n_audio": 12
},
"audio_preprocessor_cfg": {
"sample_rate": 16000,
"normalize": "per_feature",
"window_size": 0.025,
"window_stride": 0.01,
"window": "hann",
"features": 128,
"n_fft": 512,
"log": true,
"frame_splicing": 1,
"dither": 1e-05,
"pad_to": 0,
"pad_value": 0.0
},
"audio_encoder_cfg": {
"feat_in": 128,
"feat_out": -1,
"n_layers": 17,
"d_model": 512,
"subsampling": "dw_striding",
"subsampling_factor": 8,
"subsampling_conv_channels": 256,
"causal_downsampling": false,
"reduction": null,
"reduction_position": null,
"reduction_factor": 1,
"ff_expansion_factor": 4,
"self_attention_model": "rel_pos",
"n_heads": 8,
"att_context_size": [
-1,
-1
],
"xscaling": false,
"untie_biases": true,
"pos_emb_max_len": 5000,
"conv_kernel_size": 9,
"conv_norm_type": "batch_norm",
"conv_context_size": null,
"dropout": 0.1,
"dropout_pre_encoder": 0.1,
"dropout_emb": 0,
"dropout_att": 0.1,
"fixed_input_frames": 1024
},
"audio_depthformer_cfg": {
"layers": 6,
"dim": 1024,
"tie": true,
"proj_dim": 8192,
"num_heads": 32,
"num_key_value_heads": 8,
"max_seq_len": 8,
"vocab_size": 2049,
"rope_theta": 1000000.0
},
"audio_detokenizer_cfg": {
"lm_cfg": {
"model_type": "lfm2",
"data_type": "bfloat16",
"arch": "lfm",
"token_cfg": {
"vocab_size": 65536
},
"rope_cfg": {
"rope_theta": 1000000.0,
"rope_local_base_freq": 10000,
"rope_scaling": {
"factor": 1.0,
"low_freq_factor": 0,
"high_freq_factor": 0,
"original_max_position_embeddings": 0,
"long_factor": null,
"short_factor": null,
"rope_type": "default",
"mrope_section": null,
"mrope_interleaved": false
}
},
"attn_cfg": {
"num_attention_heads": 16,
"num_key_value_heads": 8,
"head_dim": 32,
"swa_enable": true,
"sliding_window": 30,
"attention_bias": false,
"attention_dropout": 0.0,
"query_pre_attn_scalar": 0
},
"mlp_cfg": {
"intermediate_size": 3328,
"act": "silu",
"num_layers": 3,
"mlp_bias": false
},
"hidden_size": 512,
"num_hidden_layers": 8,
"max_position_embeddings": 128000,
"rms_norm_eps": 1e-05,
"rms_norm_unit_offset": false,
"layer_types": [
"conv",
"conv",
"sliding_attention",
"conv",
"sliding_attention",
"conv",
"sliding_attention",
"conv"
],
"attn_logit_softcapping": null,
"final_logit_softcapping": null,
"lm_head_num_splits": 1,
"lm_head_split_dim": 0,
"conv_L_cache": 3,
"conv_bias": false,
"lora_cfg": null
},
"output_size": 1282,
"tokens_per_frame": 6,
"cache_len_tokens": 30,
"include_projection": true,
"istft_cfg": {
"n_fft": 1280,
"hop_length": 320,
"win_length": 1280,
"window": "hann",
"sample_rate": 24000,
"padding": "same"
}
},
"language_model_name": "LFM2.5-Audio-1.5B_language",
"audio_encoder_model_name": "LFM2.5-Audio-1.5B_audio_encoder",
"audio_depthformer_core_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_core",
"audio_depthformer_head_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_head_cb",
"audio_detokenizer_model_name": "LFM2.5-Audio-1.5B_audio_detokenizer"
}