{ "architectures": [ "LongcatNextForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "audio_config": { "audio_head_transformer_dims": 3072, "audio_head_transformer_ffn_scale": 16, "audio_head_transformer_layers": 4, "audio_delim_token_id": 131116, "audio_end_token_id": 131104, "audio_pad_token_id": 131105, "audio_start_token_id": 131103, "audiogen_end_token_id": 131124, "audiogen_start_token_id": 131123, "audiotext_end_token_id": 131121, "audiotext_pad_token_id": 131122, "audiotext_start_token_id": 131120, "_attn_implementation": "flash_attention_2", "d_model": 1280, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_layers": 8, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_layers": 32, "num_mel_bins": 128, "avg_pooler": 4, "decoder_kernel_size": 3, "decoder_stride_size": 2, "hop_length": 160, "kernel_size": 3, "max_audio_seconds": 30, "n_fft": 400, "num_hidden_layers": 32, "sampling_rate": 16000, "stride_size": 2, "vq_config": { "codebook_sizes": [ 8192, 4096, 2048, 1024, 1024, 1024, 1024, 1024 ] }, "vocoder_config": { "channels": [ 256, 256, 256, 256, 256 ], "hop_length": 256, "num_mel_bins": 80, "sampling_rate": 16000 }, "flow_matching_config": { "in_channels": 80, "spk_emb_dim": 0, "diffusion_steps": 10, "cal_mel_mae": true, "prenet_activation_function": "gelu", "prenet_attention_heads": 8, "prenet_d_model": 512, "prenet_ffn_dim": 2048, "prenet_in_dim": 1280, "prenet_max_source_positions": 5000, "prenet_nlayers": 12, "prenet_out_dim": 80, "prenet_target_mel_length_scale_ratio": 1.0, "channels": [ 256 ], "dropout": 0.0, "attention_head_dim": 64, "n_blocks": 4, "num_heads": 8, "num_mid_blocks": 12, "act_fn": "gelu", "cfm_params": { "inference_cfg_rate": 0.7, "sigma_min": 1e-06, "solver": "euler", "t_scheduler": "cosine", "training_cfg_rate": 0.2 }, "use_hidden_states_before_dconv2": true }, "cosy24kvocoder_config": { "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/cosy24k_vocoder/hift.pt" } }, "audio_offset": 131125, "auto_map": { "AutoConfig": "configuration_longcat_next.LongcatNextConfig", "AutoModel": "modeling_longcat_next.LongcatNextModel", "AutoModelForCausalLM": "modeling_longcat_next.LongcatNextForCausalLM" }, "bos_token_id": 1, "emb_neighbor_num": 4, "emb_split_num": 4, "eos_token_id": 2, "expert_ffn_hidden_size": 1024, "ffn_hidden_size": 6144, "hidden_size": 3072, "kv_lora_rank": 512, "max_position_embeddings": 131072, "mla_scale_kv_lora": true, "mla_scale_q_lora": true, "model_type": "longcat_next", "moe_topk": 12, "n_routed_experts": 256, "ngram_vocab_size_ratio": 78, "num_attention_heads": 32, "num_layers": 14, "q_lora_rank": 1536, "qk_nope_head_dim": 128, "qk_rope_head_dim": 64, "quantization": { "group_size": 64, "bits": 4, "mode": "affine", "model.layers.0.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.1.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.2.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.3.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.4.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.5.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.6.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.7.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.8.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.9.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.10.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.11.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.12.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.13.mlp.router.classifier": { "group_size": 64, "bits": 8 } }, "quantization_config": { "group_size": 64, "bits": 4, "mode": "affine", "model.layers.0.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.1.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.2.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.3.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.4.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.5.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.6.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.7.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.8.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.9.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.10.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.11.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.12.mlp.router.classifier": { "group_size": 64, "bits": 8 }, "model.layers.13.mlp.router.classifier": { "group_size": 64, "bits": 8 } }, "rms_norm_eps": 1e-05, "rope_theta": 10000000, "routed_scaling_factor": 6.0, "text_vocab_plus_multimodal_special_token_size": 131125, "text_vocab_size": 131072, "torch_dtype": "bfloat16", "transformers_version": "4.57.6", "use_cache": true, "v_head_dim": 128, "visual_config": { "image_start_token_id": 131106, "image_end_token_id": 131107, "image_pad_token_id": 131108, "image_newline_token_id": 131109, "_attn_implementation": "flash_attention_2", "hidden_size": 1280, "image_head_transformer_dims": 2048, "image_head_transformer_ffn_scale": 16, "image_head_transformer_layers": 4, "vq_config": { "codebook_dim": 3584, "codebook_size": 16384, "codebook_sizes": [ 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384 ], "decay": 0.99, "depth": 8, "commit_loss_ratio": 0.25, "entropy_loss_ratio": 0, "in_channels": 3584, "quant_conv": true, "quantizer_type": "rq", "restart_unused_codes": true, "shared_codebook": true, "vq_loss_ratio": 0 }, "visual_decoder_config": { "codebook_dim": 3584, "image_decoder_config": { "attention_dropout": 0.0, "codebook_dim": 3584, "distill_taps": [ 3, 7, 15, 23 ], "hidden_act": "gelu", "hidden_size": 1024, "intermediate_size": 2730, "k_bias": false, "layer_norm_eps": 1e-06, "num_attention_heads": 16, "num_hidden_layers": 32, "patch_size": 14, "q_bias": true, "spatial_merge_size": 2, "subln": true, "swiglu": true, "teacher_dims": { "15": 1280, "23": 1280, "3": 1280, "7": 1280 }, "temporal_patch_size": 2, "v_bias": true }, "transformer_config": { "patch_size": 2, "in_channels": 16, "hidden_size": 2520, "num_layers": 32, "num_refiner_layers": 2, "num_attention_heads": 21, "num_kv_heads": 7, "multiple_of": 256, "norm_eps": 1e-05, "axes_dim_rope": [ 40, 40, 40 ], "axes_lens": [ 10000, 10000, 10000 ], "text_feat_dim": 2048, "timestep_scale": 1000.0 }, "vae_config": { "act_fn": "silu", "block_out_channels": [ 128, 256, 512, 512 ], "down_block_types": [ "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D" ], "in_channels": 3, "latent_channels": 16, "layers_per_block": 2, "mid_block_add_attention": true, "norm_num_groups": 32, "out_channels": 3, "sample_size": 1024, "scaling_factor": 0.3611, "shift_factor": 0.1159, "up_block_types": [ "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D" ], "use_post_quant_conv": false, "use_quant_conv": false, "force_upcast": true }, "scheduler_config": { "num_train_timesteps": 1000, "dynamic_time_shift": true }, "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/image_decoder/image_decoder.safetensors" } }, "visual_embedding_layer_hidden_act": "silu", "visual_embedding_layer_intermediate_size": 8192, "visual_offset": 150581, "vocab_size": 282624, "zero_expert_num": 128, "zero_expert_type": "identity" }