File size: 7,365 Bytes

5c0d3d3

{
    "model_name": "LFM2.5-Audio-1.5B",
    "model_type": "aud-lfm2-s2s",
    "vm_cfg": null,
    "mm_cfg": null,
    "lm_cfg": {
        "model_type": "lfm2",
        "data_type": "bfloat16",
        "arch": "lfm",
        "token_cfg": {
            "vocab_size": 65536
        },
        "rope_cfg": {
            "rope_theta": 1000000,
            "rope_local_base_freq": 1000000,
            "rope_scaling": {
                "factor": 1.0,
                "low_freq_factor": 0,
                "high_freq_factor": 0,
                "original_max_position_embeddings": 0,
                "long_factor": null,
                "short_factor": null,
                "rope_type": "default",
                "mrope_section": null,
                "mrope_interleaved": false
            }
        },
        "attn_cfg": {
            "num_attention_heads": 32,
            "num_key_value_heads": 8,
            "head_dim": 64,
            "swa_enable": false,
            "sliding_window": 0,
            "attention_bias": false,
            "attention_dropout": 0.0,
            "query_pre_attn_scalar": 0
        },
        "mlp_cfg": {
            "intermediate_size": 8192,
            "act": "silu",
            "num_layers": 3,
            "mlp_bias": false
        },
        "hidden_size": 2048,
        "num_hidden_layers": 16,
        "max_position_embeddings": 2048,
        "rms_norm_eps": 1e-05,
        "rms_norm_unit_offset": false,
        "layer_types": [
            "conv",
            "conv",
            "full_attention",
            "conv",
            "conv",
            "full_attention",
            "conv",
            "conv",
            "full_attention",
            "conv",
            "full_attention",
            "conv",
            "full_attention",
            "conv",
            "full_attention",
            "conv"
        ],
        "attn_logit_softcapping": null,
        "final_logit_softcapping": null,
        "lm_head_num_splits": 1,
        "lm_head_split_dim": 65536,
        "conv_L_cache": 3,
        "conv_bias": false,
        "lora_cfg": null
    },
    "pipeline_cfg": {
        "system_prompt": null,
        "chat_template": null,
        "max_num_tokens": 2048,
        "input_token_group_size": 128,
        "input_token_group_offsets": [
            0,
            128,
            256,
            384,
            512,
            640,
            768,
            896,
            1024,
            1152,
            1280,
            1408,
            1536,
            1664,
            1792,
            1920
        ],
        "future_token_mask_size": 128,
        "return_logits": false,
        "use_strided_kv_cache": false,
        "enable_filter_sharing": false,
        "quantize_embeddings": false,
        "split_mlp": true
    },
    "audio_pipeline_cfg": {
        "codebooks": 8,
        "tie_audio_embeddings": false,
        "semantic_codebook_factor": 100,
        "codebook_weight": "log",
        "interleaved_n_text": 6,
        "interleaved_n_audio": 12
    },
    "audio_preprocessor_cfg": {
        "sample_rate": 16000,
        "normalize": "per_feature",
        "window_size": 0.025,
        "window_stride": 0.01,
        "window": "hann",
        "features": 128,
        "n_fft": 512,
        "log": true,
        "frame_splicing": 1,
        "dither": 1e-05,
        "pad_to": 0,
        "pad_value": 0.0
    },
    "audio_encoder_cfg": {
        "feat_in": 128,
        "feat_out": -1,
        "n_layers": 17,
        "d_model": 512,
        "subsampling": "dw_striding",
        "subsampling_factor": 8,
        "subsampling_conv_channels": 256,
        "causal_downsampling": false,
        "reduction": null,
        "reduction_position": null,
        "reduction_factor": 1,
        "ff_expansion_factor": 4,
        "self_attention_model": "rel_pos",
        "n_heads": 8,
        "att_context_size": [
            -1,
            -1
        ],
        "xscaling": false,
        "untie_biases": true,
        "pos_emb_max_len": 5000,
        "conv_kernel_size": 9,
        "conv_norm_type": "batch_norm",
        "conv_context_size": null,
        "dropout": 0.1,
        "dropout_pre_encoder": 0.1,
        "dropout_emb": 0,
        "dropout_att": 0.1,
        "fixed_input_frames": 1024
    },
    "audio_depthformer_cfg": {
        "layers": 6,
        "dim": 1024,
        "tie": true,
        "proj_dim": 8192,
        "num_heads": 32,
        "num_key_value_heads": 8,
        "max_seq_len": 8,
        "vocab_size": 2049,
        "rope_theta": 1000000.0
    },
    "audio_detokenizer_cfg": {
        "lm_cfg": {
            "model_type": "lfm2",
            "data_type": "bfloat16",
            "arch": "lfm",
            "token_cfg": {
                "vocab_size": 65536
            },
            "rope_cfg": {
                "rope_theta": 1000000.0,
                "rope_local_base_freq": 10000,
                "rope_scaling": {
                    "factor": 1.0,
                    "low_freq_factor": 0,
                    "high_freq_factor": 0,
                    "original_max_position_embeddings": 0,
                    "long_factor": null,
                    "short_factor": null,
                    "rope_type": "default",
                    "mrope_section": null,
                    "mrope_interleaved": false
                }
            },
            "attn_cfg": {
                "num_attention_heads": 16,
                "num_key_value_heads": 8,
                "head_dim": 32,
                "swa_enable": true,
                "sliding_window": 30,
                "attention_bias": false,
                "attention_dropout": 0.0,
                "query_pre_attn_scalar": 0
            },
            "mlp_cfg": {
                "intermediate_size": 3328,
                "act": "silu",
                "num_layers": 3,
                "mlp_bias": false
            },
            "hidden_size": 512,
            "num_hidden_layers": 8,
            "max_position_embeddings": 128000,
            "rms_norm_eps": 1e-05,
            "rms_norm_unit_offset": false,
            "layer_types": [
                "conv",
                "conv",
                "sliding_attention",
                "conv",
                "sliding_attention",
                "conv",
                "sliding_attention",
                "conv"
            ],
            "attn_logit_softcapping": null,
            "final_logit_softcapping": null,
            "lm_head_num_splits": 1,
            "lm_head_split_dim": 0,
            "conv_L_cache": 3,
            "conv_bias": false,
            "lora_cfg": null
        },
        "output_size": 1282,
        "tokens_per_frame": 6,
        "cache_len_tokens": 30,
        "include_projection": true,
        "istft_cfg": {
            "n_fft": 1280,
            "hop_length": 320,
            "win_length": 1280,
            "window": "hann",
            "sample_rate": 24000,
            "padding": "same"
        }
    },
    "language_model_name": "LFM2.5-Audio-1.5B_language",
    "audio_encoder_model_name": "LFM2.5-Audio-1.5B_audio_encoder",
    "audio_depthformer_core_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_core",
    "audio_depthformer_head_model_name": "LFM2.5-Audio-1.5B_audio_depthformer_head_cb",
    "audio_detokenizer_model_name": "LFM2.5-Audio-1.5B_audio_detokenizer"
}