{ "architectures": [ "LongcatNextForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "audio_config": { "activation_dropout": 0.0, "activation_function": "gelu", "apply_spec_augment": false, "attention_dropout": 0.0, "audio_delim_token_id": 131116, "audio_end_token_id": 131104, "audio_head_transformer_dims": 3072, "audio_head_transformer_ffn_scale": 16, "audio_head_transformer_layers": 4, "audio_pad_token_id": 131105, "audio_start_token_id": 131103, "audiogen_end_token_id": 131124, "audiogen_start_token_id": 131123, "audiotext_end_token_id": 131121, "audiotext_pad_token_id": 131122, "audiotext_start_token_id": 131120, "avg_pooler": 4, "classifier_proj_size": 256, "cosy24kvocoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/cosy24k_vocoder/hift.pt" }, "d_model": 1280, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_kernel_size": 3, "decoder_layerdrop": 0.0, "decoder_layers": 8, "decoder_stride_size": 2, "dropout": 0.0, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_layerdrop": 0.0, "encoder_layers": 32, "flow_matching_config": { "_name_or_path": "", "act_fn": "gelu", "add_cross_attention": false, "architectures": null, "attention_head_dim": 64, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "cal_mel_mae": true, "cfm_params": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "inference_cfg_rate": 0.7, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "sigma_min": 1e-06, "solver": "euler", "suppress_tokens": null, "t_scheduler": "cosine", "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "training_cfg_rate": 0.2, "typical_p": 1.0, "use_bfloat16": false }, "channels": [ 256 ], "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diffusion_steps": 10, "diversity_penalty": 0.0, "do_sample": false, "dropout": 0.0, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "in_channels": 80, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "n_blocks": 4, "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_heads": 8, "num_mid_blocks": 12, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "prenet_activation_function": "gelu", "prenet_attention_heads": 8, "prenet_d_model": 512, "prenet_ffn_dim": 2048, "prenet_in_dim": 1280, "prenet_max_source_positions": 5000, "prenet_nlayers": 12, "prenet_out_dim": 80, "prenet_target_mel_length_scale_ratio": 1.0, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "spk_emb_dim": 0, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "use_hidden_states_before_dconv2": true }, "hop_length": 160, "init_std": 0.02, "kernel_size": 3, "mask_feature_length": 10, "mask_feature_min_masks": 0, "mask_feature_prob": 0.0, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_time_prob": 0.05, "max_audio_seconds": 30, "max_source_positions": 1500, "max_target_positions": 448, "median_filter_width": 7, "model_type": "longcat_next_audio", "n_fft": 400, "num_hidden_layers": 32, "num_mel_bins": 128, "sampling_rate": 16000, "scale_embedding": false, "stride_size": 2, "use_cache": true, "use_weighted_layer_sum": false, "vocab_size": 51865, "vocoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "channels": [ 256, 256, 256, 256, 256 ], "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hop_length": 256, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_mel_bins": 80, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sampling_rate": 16000, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false }, "vq_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "codebook_sizes": [ 8192, 4096, 2048, 1024, 1024, 1024, 1024, 1024 ], "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false } }, "audio_offset": 131125, "auto_map": { "AutoConfig": "configuration_longcat_next.LongcatNextConfig", "AutoModel": "modeling_longcat_next.LongcatNextModel", "AutoModelForCausalLM": "modeling_longcat_next.LongcatNextForCausalLM" }, "bos_token_id": 1, "dtype": "bfloat16", "emb_neighbor_num": 4, "emb_split_num": 4, "eos_token_id": 2, "expert_ffn_hidden_size": 1024, "ffn_hidden_size": 6144, "head_dim": 64, "hidden_act": "silu", "hidden_size": 3072, "initializer_range": 0.02, "kv_lora_rank": 512, "max_position_embeddings": 131072, "mla_scale_kv_lora": true, "mla_scale_q_lora": true, "model_type": "longcat_next", "moe_topk": 12, "n_routed_experts": 256, "ngram_vocab_size_ratio": 78, "num_attention_heads": 32, "num_hidden_layers": 28, "num_key_value_heads": 32, "num_layers": 14, "oe_ignored_token_ids": [ 131072, 131073, 131074, 131075, 131076, 131077, 131078, 131079, 131080, 131081, 131082, 131083, 131084, 131085, 131086, 131087, 131088, 131089, 131090, 131091, 131092, 131093, 131094, 131095, 131096, 131097, 131098, 131099, 131100, 131101, 131102, 131103, 131104, 131105, 131106, 131107, 131108, 131109, 131110, 131111, 131112, 131113, 131114, 131115, 131116, 131117, 131118, 131119, 131120, 131121, 131122, 131123, 131124 ], "q_lora_rank": 1536, "qk_head_dim": 192, "qk_nope_head_dim": 128, "qk_rope_head_dim": 64, "quantization_config": { "autoround_version": "0.13.0", "batch_size": 1, "bits": 4, "block_name_to_quantize": "model.layers", "data_type": "int", "extra_config": { ".*classifier.*": { "bits": 16, "data_type": "float" }, "model.layers.0.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.1.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.10.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.11.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.12.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.13.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.2.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.3.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.4.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.5.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.6.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.7.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.8.mlp.router.classifier": { "bits": 16, "data_type": "float" }, "model.layers.9.mlp.router.classifier": { "bits": 16, "data_type": "float" } }, "gradient_accumulate_steps": 8, "group_size": 128, "packing_format": "auto_round:auto_gptq", "quant_method": "auto-round", "seqlen": 512, "sym": true }, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000000, "routed_scaling_factor": 6.0, "text_vocab_plus_multimodal_special_token_size": 131125, "text_vocab_size": 131072, "tie_word_embeddings": false, "transformers_version": "4.57.6", "use_cache": true, "v_head_dim": 128, "visual_config": { "depth": 32, "fullatt_block_indexes": [ 7, 15, 23, 31 ], "hidden_act": "silu", "hidden_size": 1280, "image_end_token_id": 131107, "image_head_transformer_dims": 2048, "image_head_transformer_ffn_scale": 16, "image_head_transformer_layers": 4, "image_newline_token_id": 131109, "image_pad_token_id": 131108, "image_start_token_id": 131106, "in_channels": 3, "initializer_range": 0.02, "intermediate_size": 3420, "model_type": "longcat_next_visual", "num_heads": 16, "out_hidden_size": 3584, "patch_size": 14, "spatial_merge_size": 2, "temporal_patch_size": 2, "tokens_per_second": 4, "visual_decoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "codebook_dim": 3584, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_decoder_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "codebook_dim": 3584, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "distill_taps": [ 3, 7, 15, 23 ], "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "gelu", "hidden_size": 1024, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "intermediate_size": 2730, "is_decoder": false, "is_encoder_decoder": false, "k_bias": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "layer_norm_eps": 1e-06, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_attention_heads": 16, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 32, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 14, "prefix": null, "problem_type": null, "pruned_heads": {}, "q_bias": true, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "spatial_merge_size": 2, "subln": true, "suppress_tokens": null, "swiglu": true, "task_specific_params": null, "teacher_dims": { "15": 1280, "23": 1280, "3": 1280, "7": 1280 }, "temperature": 1.0, "temporal_patch_size": 2, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "v_bias": true }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "scheduler_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "dynamic_time_shift": true, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "num_train_timesteps": 1000, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false }, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "transformer_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "axes_dim_rope": [ 40, 40, 40 ], "axes_lens": [ 10000, 10000, 10000 ], "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_size": 2520, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "in_channels": 16, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "multiple_of": 256, "no_repeat_ngram_size": 0, "norm_eps": 1e-05, "num_attention_heads": 21, "num_beam_groups": 1, "num_beams": 1, "num_kv_heads": 7, "num_layers": 32, "num_refiner_layers": 2, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_size": 2, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "text_feat_dim": 2048, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "timestep_scale": 1000.0, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false }, "typical_p": 1.0, "use_bfloat16": false, "vae_config": { "_name_or_path": "", "act_fn": "silu", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "block_out_channels": [ 128, 256, 512, 512 ], "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "down_block_types": [ "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D" ], "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "force_upcast": true, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "in_channels": 3, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "latent_channels": 16, "layers_per_block": 2, "length_penalty": 1.0, "max_length": 20, "mid_block_add_attention": true, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "norm_num_groups": 32, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "out_channels": 3, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sample_size": 1024, "scaling_factor": 0.3611, "sep_token_id": null, "shift_factor": 0.1159, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "up_block_types": [ "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D" ], "use_bfloat16": false, "use_post_quant_conv": false, "use_quant_conv": false }, "weight_path": "WEIGHT_PATH_TO_LONGCAT_NEXT/image_decoder/image_decoder.safetensors" }, "vq_config": { "_name_or_path": "", "add_cross_attention": false, "architectures": null, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "codebook_dim": 3584, "codebook_size": 16384, "codebook_sizes": [ 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384 ], "commit_loss_ratio": 0.25, "cross_attention_hidden_size": null, "decay": 0.99, "decoder_start_token_id": null, "depth": 8, "diversity_penalty": 0.0, "do_sample": false, "dtype": null, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "entropy_loss_ratio": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "in_channels": 3584, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "model_type": "", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "quant_conv": true, "quantizer_type": "rq", "remove_invalid_values": false, "repetition_penalty": 1.0, "restart_unused_codes": true, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "shared_codebook": true, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "vq_loss_ratio": 0 }, "window_size": 112 }, "visual_embedding_layer_hidden_act": "silu", "visual_embedding_layer_intermediate_size": 8192, "visual_offset": 150581, "vocab_size": 282624, "zero_expert_num": 128, "zero_expert_type": "identity" }