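"""Patches for vllm's ModelConfig.

Redefines ``_verify_with_expert_parallelism``, ``is_deepseek_mla`` and
``get_head_size`` and installs them on ``vllm.config.ModelConfig``. The
variants probe additional Hugging Face config attribute names (for example
``attention_kv_lora_dim`` alongside ``kv_lora_rank``) and treat
``pangu_ultra_moe`` as a DeepSeek-style MLA model. Importing this module
applies the patches as a side effect.
"""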
from vllm.config import ModelConfig


def get_attr_by_names(src_config, attrs, default_value):
    """Return the first attribute from ``attrs`` that is set to a positive
    value on ``src_config``, or ``default_value`` if none is found."""
    for attr in attrs:
        value = getattr(src_config, attr, 0)
        # Treat attributes that are missing, None, or non-positive as unset.
        if value is not None and value > 0:
            return value
    return default_value


def _verify_with_expert_parallelism(self) -> None:
    """Require a positive expert count when expert parallelism is enabled.

    The expert count may be stored under different attribute names depending
    on the model family, so several candidates are probed in order.
    """
    num_expert_names = [
        "moe_num_experts",
        "num_experts",
        "n_routed_experts",
        "num_local_experts",
        "num_routed_experts",
    ]
    num_experts = 0
    for name in num_expert_names:
        num_experts = getattr(self.hf_text_config, name, 0)
        if num_experts > 0:
            break
    if num_experts < 1:
        raise ValueError(
            "Number of experts in the model must be greater than 0 "
            "when expert parallelism is enabled.")


@property
def is_deepseek_mla(self) -> bool:
    """Whether the model uses DeepSeek-style multi-head latent attention.

    Detection is based on the model type plus the presence of a KV LoRA
    dimension, which may be exposed as either ``attention_kv_lora_dim``
    or ``kv_lora_rank``.
    """
    kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
    kv_lora_dim = get_attr_by_names(self.hf_text_config, kv_lora_dim_names,
                                    None)
    if not hasattr(self.hf_text_config, "model_type"):
        return False
    elif self.hf_text_config.model_type in \
            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'pangu_ultra_moe'):
        return kv_lora_dim is not None
    elif self.hf_text_config.model_type == 'eagle':
        # EAGLE draft models keep the target model's config under ``model``.
        return self.hf_text_config.model.model_type in \
            ('deepseek_v2', 'deepseek_v3', 'pangu_ultra_moe') \
            and kv_lora_dim is not None
    return False


def get_head_size(self) -> int:
    """Return the per-head size used for attention and KV-cache sizing."""
    if self.is_deepseek_mla:
        qk_rope_dim_names = ['attention_qk_rope_dim', 'qk_rope_head_dim']
        kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
        qk_rope_dim = get_attr_by_names(self.hf_text_config,
                                        qk_rope_dim_names, 0)
        kv_lora_dim = get_attr_by_names(self.hf_text_config,
                                        kv_lora_dim_names, 0)
        if self.use_mla:
            # MLA kernels operate on the compressed KV latent plus the
            # rotary (RoPE) part of the query/key head.
            return kv_lora_dim + qk_rope_dim
        else:
            qk_dim_names = ['attention_qk_dim', 'qk_nope_head_dim']
            qk_dim = get_attr_by_names(self.hf_text_config, qk_dim_names, 0)
            if qk_rope_dim and qk_dim:
                return qk_rope_dim + qk_dim

    if hasattr(self.hf_text_config, "model_type") and \
            self.hf_text_config.model_type == "zamba2":
        return self.hf_text_config.attention_head_dim

    if self.is_attention_free:
        return 0

    if getattr(self.hf_text_config, "head_dim", None) is not None:
        return self.hf_text_config.head_dim

    return (self.hf_text_config.hidden_size //
            self.hf_text_config.num_attention_heads)
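

# Worked example for get_head_size above, with dimensions assumed from a
# DeepSeek-V3-style config (kv_lora_rank=512, qk_rope_head_dim=64,
# qk_nope_head_dim=128): with MLA enabled the head size is 512 + 64 = 576;
# without MLA it is 128 + 64 = 192.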


# Install the variants above onto vllm's ModelConfig, overriding the
# upstream implementations.
ModelConfig._verify_with_expert_parallelism = _verify_with_expert_parallelism
ModelConfig.is_deepseek_mla = is_deepseek_mla
ModelConfig.get_head_size = get_head_size
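

# The block below is an illustrative, self-contained check of the
# get_attr_by_names helper; ``fake_cfg`` and its attribute values are made up
# for demonstration and do not come from any real model config.
if __name__ == "__main__":
    from types import SimpleNamespace

    fake_cfg = SimpleNamespace(kv_lora_rank=512, qk_rope_head_dim=64)

    # 'attention_kv_lora_dim' is absent, so the lookup falls back to
    # 'kv_lora_rank'.
    assert get_attr_by_names(
        fake_cfg, ['attention_kv_lora_dim', 'kv_lora_rank'], None) == 512

    # Neither candidate name is present, so the default value is returned.
    assert get_attr_by_names(
        fake_cfg, ['attention_qk_dim', 'qk_nope_head_dim'], 0) == 0

    print("get_attr_by_names fallback checks passed")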