from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
from transformers.models.whisper.configuration_whisper import WhisperConfig
from .configuration_longcat_ngram import LongcatFlashNgramConfig


class LongcatNextConfig(LongcatFlashNgramConfig):
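    """Configuration for the LongcatNext multimodal model.

    Extends ``LongcatFlashNgramConfig`` with a unified multimodal vocabulary
    (text, audio, visual and multimodal special tokens) and nested
    ``visual_config``/``audio_config`` sub-configurations; ``audio_offset`` and
    ``visual_offset`` mark where the audio and visual token-id ranges begin.
    """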
    model_type = "longcat_next"

    def __init__(
        self,
        vocab_size=131072,
        hidden_size=6144,
        num_hidden_layers=56,
        num_layers=28,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        ffn_hidden_size=12288,
        q_lora_rank=1536,
        kv_lora_rank=512,
        qk_nope_head_dim=128,
        qk_rope_head_dim=64,
        head_dim=64,
        v_head_dim=128,
        qk_head_dim=None,
        moe_topk=12,
        n_routed_experts=512,
        zero_expert_num=256,
        expert_ffn_hidden_size=2048,
        routed_scaling_factor=6.0,
        emb_neighbor_num=None,
        emb_split_num=None,
        ngram_vocab_size_ratio=None,
        oe_ignored_token_ids=[],
        # text vocab size (vocab_size = text_vocab_size + audio_token + visual_token + multimodal_special_token_list)
        text_vocab_size=131072,
        text_vocab_plus_multimodal_special_token_size=131125,
        visual_embedding_layer_intermediate_size=8192,
        visual_embedding_layer_hidden_act="silu",
        visual_offset=150581,
        audio_offset=131125,
        visual_config={},
        audio_config={},
        **kwargs,
    ):
        self.text_vocab_size = text_vocab_size
        self.text_vocab_plus_multimodal_special_token_size = text_vocab_plus_multimodal_special_token_size
        self.visual_embedding_layer_intermediate_size = visual_embedding_layer_intermediate_size
        self.visual_embedding_layer_hidden_act = visual_embedding_layer_hidden_act
        self.visual_offset = visual_offset
        self.audio_offset = audio_offset
        # Nested multimodal sub-configs (defined later in this module).
        self.visual_config = LongcatNextVisualConfig(**visual_config)
        self.audio_config = LongcatNextAudioConfig(**audio_config)
        # If not given, default to the multimodal special-token id range.
        oe_ignored_token_ids = oe_ignored_token_ids or list(range(self.text_vocab_size, self.text_vocab_plus_multimodal_special_token_size))
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_layers=num_layers,
            num_attention_heads=num_attention_heads,
            num_key_value_heads=num_key_value_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            initializer_range=initializer_range,
            rms_norm_eps=rms_norm_eps,
            use_cache=use_cache,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            attention_bias=attention_bias,
            attention_dropout=attention_dropout,
            ffn_hidden_size=ffn_hidden_size,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
            qk_nope_head_dim=qk_nope_head_dim,
            qk_rope_head_dim=qk_rope_head_dim,
            head_dim=head_dim,
            v_head_dim=v_head_dim,
            qk_head_dim=qk_head_dim,
            moe_topk=moe_topk,
            n_routed_experts=n_routed_experts,
            zero_expert_num=zero_expert_num,
            expert_ffn_hidden_size=expert_ffn_hidden_size,
            routed_scaling_factor=routed_scaling_factor,
            emb_neighbor_num=emb_neighbor_num,
            emb_split_num=emb_split_num,
            ngram_vocab_size_ratio=ngram_vocab_size_ratio,
            oe_ignored_token_ids=oe_ignored_token_ids,
            **kwargs,
        )


class LongcatNextVisualConfig(Qwen2_5_VLVisionConfig):
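    """Vision sub-configuration for LongcatNext.

    Extends ``Qwen2_5_VLVisionConfig`` with image special-token ids and nested
    VQ / visual-decoder sub-configurations.
    """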
    model_type = "longcat_next_visual"
    base_config_key = ""

    def __init__(
        self,
        image_start_token_id=131106,
        image_end_token_id=131107,
        image_pad_token_id=131108,
        image_newline_token_id=131109,
        vq_config={},
        visual_decoder_config={},
        **kwargs,
    ):
        self.image_start_token_id = image_start_token_id
        self.image_end_token_id = image_end_token_id
        self.image_pad_token_id = image_pad_token_id
        self.image_newline_token_id = image_newline_token_id
        self.vq_config = PretrainedConfig(**vq_config)
        self.visual_decoder_config = PretrainedConfig(**visual_decoder_config)
        self.visual_decoder_config.image_decoder_config = PretrainedConfig(**getattr(self.visual_decoder_config, "image_decoder_config", {}))
        self.visual_decoder_config.transformer_config = PretrainedConfig(**getattr(self.visual_decoder_config, "transformer_config", {}))
        self.visual_decoder_config.vae_config = PretrainedConfig(**getattr(self.visual_decoder_config, "vae_config", {}))
        self.visual_decoder_config.scheduler_config = PretrainedConfig(**getattr(self.visual_decoder_config, "scheduler_config", {}))
        super().__init__(**kwargs)


class LongcatNextAudioConfig(WhisperConfig):
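    """Audio sub-configuration for LongcatNext.

    Extends ``WhisperConfig`` with nested VQ, vocoder, flow-matching and
    ``cosy24kvocoder_config`` sub-configurations.
    """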
    model_type = "longcat_next_audio"
    base_config_key = ""

    def __init__(
        self,
        vq_config={},
        vocoder_config={},
        flow_matching_config={},
        cosy24kvocoder_config={},
        **kwargs,
    ):
        self.vq_config = PretrainedConfig(**vq_config)
        self.vocoder_config = PretrainedConfig(**vocoder_config)
        self.flow_matching_config = PretrainedConfig(**flow_matching_config)
        self.flow_matching_config.cfm_params = PretrainedConfig(**getattr(self.flow_matching_config, "cfm_params", {}))
        self.cosy24kvocoder_config = PretrainedConfig(**cosy24kvocoder_config)
        super().__init__(**kwargs)


__all__ = ["LongcatNextConfig", "LongcatNextVisualConfig", "LongcatNextAudioConfig"]
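
# Example (a minimal sketch): the nested multimodal dicts mirror the sub-config
# constructor arguments above; "codebook_size" is an illustrative placeholder key,
# stored as an arbitrary attribute on the generic PretrainedConfig used for the VQ
# sub-config.
#
#     config = LongcatNextConfig(
#         visual_config={"image_pad_token_id": 131108},
#         audio_config={"vq_config": {"codebook_size": 1024}},
#     )
#     config.save_pretrained("./longcat_next")        # serializes to config.json
#     config = LongcatNextConfig.from_pretrained("./longcat_next")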