Upload compiled artifacts from LFM2.5-Audio-1.5B

5c0d3d3 verified 8 days ago

7.37 kB

	{
	"model_name": "LFM2.5-Audio-1.5B",
	"model_type": "aud-lfm2-s2s",
	"vm_cfg": null,
	"mm_cfg": null,
	"lm_cfg": {
	"model_type": "lfm2",
	"data_type": "bfloat16",
	"arch": "lfm",
	"token_cfg": {
	"vocab_size": 65536
	},
	"rope_cfg": {
	"rope_theta": 1000000,
	"rope_local_base_freq": 1000000,
	"rope_scaling": {
	"factor": 1.0,
	"low_freq_factor": 0,
	"high_freq_factor": 0,
	"original_max_position_embeddings": 0,
	"long_factor": null,
	"short_factor": null,
	"rope_type": "default",
	"mrope_section": null,
	"mrope_interleaved": false
	}
	},
	"attn_cfg": {
	"num_attention_heads": 32,
	"num_key_value_heads": 8,
	"head_dim": 64,
	"swa_enable": false,
	"sliding_window": 0,
	"attention_bias": false,
	"attention_dropout": 0.0,
	"query_pre_attn_scalar": 0
	},
	"mlp_cfg": {
	"intermediate_size": 8192,
	"act": "silu",
	"num_layers": 3,
	"mlp_bias": false
	},
	"hidden_size": 2048,
	"num_hidden_layers": 16,
	"max_position_embeddings": 2048,
	"rms_norm_eps": 1e-05,
	"rms_norm_unit_offset": false,
	"layer_types": [
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"full_attention",
	"conv",
	"full_attention",
	"conv",
	"full_attention",
	"conv"
	],
	"attn_logit_softcapping": null,
	"final_logit_softcapping": null,
	"lm_head_num_splits": 1,
	"lm_head_split_dim": 65536,
	"conv_L_cache": 3,
	"conv_bias": false,
	"lora_cfg": null
	},
	"pipeline_cfg": {
	"system_prompt": null,
	"chat_template": null,
	"max_num_tokens": 2048,
	"input_token_group_size": 128,
	"input_token_group_offsets": [
	0,
	128,
	256,
	384,
	512,
	640,
	768,
	896,
	1024,
	1152,
	1280,
	1408,
	1536,
	1664,
	1792,
	1920
	],
	"future_token_mask_size": 128,
	"return_logits": false,
	"use_strided_kv_cache": false,
	"enable_filter_sharing": false,
	"quantize_embeddings": false,
	"split_mlp": true
	},
	"audio_pipeline_cfg": {
	"codebooks": 8,
	"tie_audio_embeddings": false,
	"semantic_codebook_factor": 100,
	"codebook_weight": "log",
	"interleaved_n_text": 6,
	"interleaved_n_audio": 12
	},
	"audio_preprocessor_cfg": {
	"sample_rate": 16000,
	"normalize": "per_feature",
	"window_size": 0.025,
	"window_stride": 0.01,
	"window": "hann",
	"features": 128,
	"n_fft": 512,
	"log": true,
	"frame_splicing": 1,
	"dither": 1e-05,
	"pad_to": 0,
	"pad_value": 0.0
	},
	"audio_encoder_cfg": {
	"feat_in": 128,
	"feat_out": -1,
	"n_layers": 17,
	"d_model": 512,
	"subsampling": "dw_striding",
	"subsampling_factor": 8,
	"subsampling_conv_channels": 256,
	"causal_downsampling": false,
	"reduction": null,
	"reduction_position": null,
	"reduction_factor": 1,
	"ff_expansion_factor": 4,
	"self_attention_model": "rel_pos",
	"n_heads": 8,
	"att_context_size": [
	-1,
	-1
	],
	"xscaling": false,
	"untie_biases": true,
	"pos_emb_max_len": 5000,
	"conv_kernel_size": 9,
	"conv_norm_type": "batch_norm",
	"conv_context_size": null,
	"dropout": 0.1,
	"dropout_pre_encoder": 0.1,
	"dropout_emb": 0,
	"dropout_att": 0.1,
	"fixed_input_frames": 1024
	},
	"audio_depthformer_cfg": {
	"layers": 6,
	"dim": 1024,
	"tie": true,
	"proj_dim": 8192,
	"num_heads": 32,
	"num_key_value_heads": 8,
	"max_seq_len": 8,
	"vocab_size": 2049,
	"rope_theta": 1000000.0
	},
	"audio_detokenizer_cfg": {
	"lm_cfg": {
	"model_type": "lfm2",
	"data_type": "bfloat16",
	"arch": "lfm",
	"token_cfg": {
	"vocab_size": 65536
	},
	"rope_cfg": {
	"rope_theta": 1000000.0,
	"rope_local_base_freq": 10000,
	"rope_scaling": {
	"factor": 1.0,
	"low_freq_factor": 0,
	"high_freq_factor": 0,
	"original_max_position_embeddings": 0,
	"long_factor": null,
	"short_factor": null,
	"rope_type": "default",
	"mrope_section": null,
	"mrope_interleaved": false
	}
	},
	"attn_cfg": {
	"num_attention_heads": 16,
	"num_key_value_heads": 8,
	"head_dim": 32,
	"swa_enable": true,
	"sliding_window": 30,
	"attention_bias": false,
	"attention_dropout": 0.0,
	"query_pre_attn_scalar": 0
	},
	"mlp_cfg": {
	"intermediate_size": 3328,
	"act": "silu",
	"num_layers": 3,
	"mlp_bias": false
	},
	"hidden_size": 512,
	"num_hidden_layers": 8,
	"max_position_embeddings": 128000,
	"rms_norm_eps": 1e-05,
	"rms_norm_unit_offset": false,
	"layer_types": [
	"conv",
	"conv",
	"sliding_attention",
	"conv",
	"sliding_attention",
	"conv",
	"sliding_attention",
	"conv"
	],
	"attn_logit_softcapping": null,
	"final_logit_softcapping": null,
	"lm_head_num_splits": 1,
	"lm_head_split_dim": 0,
	"conv_L_cache": 3,
	"conv_bias": false,
	"lora_cfg": null
	},
	"output_size": 1282,
	"tokens_per_frame": 6,
	"cache_len_tokens": 30,
	"include_projection": true,
	"istft_cfg": {
	"n_fft": 1280,
	"hop_length": 320,
	"win_length": 1280,
	"window": "hann",
	"sample_rate": 24000,
	"padding": "same"
	}
	},
	"language_model_name": "LFM2.5-Audio-1.5B_language",
	"audio_encoder_model_name": "LFM2.5-Audio-1.5B_audio_encoder",
	"audio_depthformer_core_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_core",
	"audio_depthformer_head_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_head_cb",
	"audio_detokenizer_model_name": "LFM2.5-Audio-1.5B_audio_detokenizer"
	}