"""Monkey patches for ``vllm.config.ModelConfig``.

Extends expert-count detection, DeepSeek-style MLA detection, and attention
head-size resolution so that alternative HF config attribute names (e.g.
``attention_kv_lora_dim``, ``attention_qk_rope_dim``, ``moe_num_experts``)
and the ``pangu_ultra_moe`` model type are recognized alongside the
DeepSeek-style names.
"""

from vllm.config import ModelConfig


def get_attr_by_names(src_config, attrs, default_value):
    """Return the first positive value among ``attrs`` on ``src_config``,
    falling back to ``default_value`` when none is set."""
    for attr in attrs:
        value = getattr(src_config, attr, 0)
        # A None value is treated like a missing attribute (avoids "None > 0").
        if value is not None and value > 0:
            return value
    return default_value
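

# Illustrative usage (not executed): the helper lets one lookup serve config
# families that spell the same hyperparameter differently, e.g. a hypothetical
# config object exposing only ``kv_lora_rank``:
#
#     from types import SimpleNamespace
#     cfg = SimpleNamespace(kv_lora_rank=512)
#     get_attr_by_names(cfg, ['attention_kv_lora_dim', 'kv_lora_rank'], None)  # -> 512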


def _verify_with_expert_parallelism(self) -> None:
    """Verify that the HF config reports a positive expert count under any of
    the known attribute names; expert parallelism requires an MoE model."""
    num_expert_names = [
        "moe_num_experts",
        "num_experts",
        "n_routed_experts",
        "num_local_experts",
        "num_routed_experts",
    ]
    num_experts = 0
    for name in num_expert_names:
        num_experts = getattr(self.hf_text_config, name, 0)
        if num_experts > 0:
            break
    if num_experts < 1:
        raise ValueError(
            "Number of experts in the model must be greater than 0 "
            "when expert parallelism is enabled.")
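

# Illustrative check (not executed): any single attribute name from the list
# above is enough to pass the verification, e.g. a stand-in config object that
# only defines ``moe_num_experts``:
#
#     from types import SimpleNamespace
#     fake = SimpleNamespace(hf_text_config=SimpleNamespace(moe_num_experts=64))
#     _verify_with_expert_parallelism(fake)  # does not raise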


@property
def is_deepseek_mla(self) -> bool:
    """Whether the model uses DeepSeek-style multi-head latent attention."""
    kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
    kv_lora_dim = get_attr_by_names(self.hf_text_config, kv_lora_dim_names,
                                    None)
    if not hasattr(self.hf_text_config, "model_type"):
        return False
    elif self.hf_text_config.model_type in \
            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'pangu_ultra_moe'):
        return kv_lora_dim is not None
    elif self.hf_text_config.model_type == 'eagle':
        # For EAGLE draft models, check the architecture of the wrapped
        # target model instead.
        return self.hf_text_config.model.model_type in \
                ('deepseek_v2', 'deepseek_v3', 'pangu_ultra_moe') \
            and kv_lora_dim is not None
    return False


def get_head_size(self) -> int:
    if self.is_deepseek_mla:
        qk_rope_dim_names = ['attention_qk_rope_dim', 'qk_rope_head_dim']
        kv_lora_dim_names = ['attention_kv_lora_dim', 'kv_lora_rank']
        qk_rope_dim = get_attr_by_names(self.hf_text_config,
                                        qk_rope_dim_names, 0)
        kv_lora_dim = get_attr_by_names(self.hf_text_config,
                                        kv_lora_dim_names, 0)
        if self.use_mla:
            # With MLA enabled, the KV cache stores the compressed latent
            # plus the rotary key dim per token.
            return kv_lora_dim + qk_rope_dim
        else:
            # Without MLA, fall back to the full query/key head size
            # (non-rotary + rotary parts).
            qk_dim_names = ['attention_qk_dim', 'qk_nope_head_dim']
            qk_dim = get_attr_by_names(self.hf_text_config, qk_dim_names, 0)
            if qk_rope_dim and qk_dim:
                return qk_rope_dim + qk_dim

    if hasattr(self.hf_text_config,
               "model_type") and (self.hf_text_config.model_type
                                  == "zamba2"):
        return self.hf_text_config.attention_head_dim

    if self.is_attention_free:
        return 0

    # Some configs set head_dim explicitly; prefer it when present.
    if getattr(self.hf_text_config, "head_dim", None) is not None:
        return self.hf_text_config.head_dim

    # Otherwise derive the head size from the hidden size.
    return (self.hf_text_config.hidden_size //
            self.hf_text_config.num_attention_heads)


# Install the patches on ModelConfig. ``is_deepseek_mla`` is already a
# property object, so the assignment attaches it as a class property.
ModelConfig._verify_with_expert_parallelism = _verify_with_expert_parallelism
ModelConfig.is_deepseek_mla = is_deepseek_mla
ModelConfig.get_head_size = get_head_size
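

# Minimal smoke-test sketch (illustrative only): the stand-in object below is
# a hypothetical DeepSeek-V3-style config, used instead of constructing a full
# ModelConfig, which would require a real model checkpoint.
if __name__ == "__main__":
    from types import SimpleNamespace

    fake = SimpleNamespace(
        hf_text_config=SimpleNamespace(model_type='deepseek_v3',
                                       kv_lora_rank=512,
                                       qk_rope_head_dim=64),
        use_mla=True,
        is_deepseek_mla=True,
    )
    # MLA path: head size is the compressed KV latent dim plus the rotary dim.
    assert ModelConfig.get_head_size(fake) == 512 + 64
    # Property form, evaluated against the stand-in config.
    assert ModelConfig.is_deepseek_mla.fget(fake) is True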