| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
# Registry of model classes that patch_vllm_moe_model_weight_loader acts on.
#
# Every class is imported inside its own guarded block so that a class missing
# from the installed vLLM build only removes that one entry instead of
# disabling others. A failed import is deliberately ignored: the class is
# simply left out of the registry.
SUPPORTED_MOE_MODELS = []

# DeepseekV2 and DeepseekV3 live in the same module but are imported
# separately so that a build exposing only one of them still registers it.
try:
    from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(DeepseekV2ForCausalLM)

try:
    from vllm.model_executor.models.deepseek_v2 import DeepseekV3ForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(DeepseekV3ForCausalLM)

try:
    from vllm.model_executor.models.mixtral import MixtralForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(MixtralForCausalLM)

try:
    from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(Qwen2MoeForCausalLM)

try:
    from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(Qwen3MoeForCausalLM)

try:
    from vllm.model_executor.models.qwen3_vl_moe import Qwen3MoeLLMForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(Qwen3MoeLLMForCausalLM)

try:
    from vllm.model_executor.models.qwen3_next import Qwen3NextForCausalLM
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(Qwen3NextForCausalLM)

try:
    from vllm.model_executor.models.kimi_vl import KimiVLForConditionalGeneration
except ImportError:
    pass
else:
    SUPPORTED_MOE_MODELS.append(KimiVLForConditionalGeneration)
|
|
|
|
def patch_vllm_moe_model_weight_loader(model):
    """Point MoE expert parameters at their ``experts`` module's weight loader.

    For each entry of ``inner_model.layers`` whose MLP module has an
    ``experts`` attribute exposing ``weight_loader``, every parameter of that
    MLP whose name contains ``w13_weight`` or ``w2_weight`` gets its
    ``weight_loader`` attribute set to ``experts.weight_loader``. Layers
    without such a module are skipped.

    The function returns without touching anything when
    ``SUPPORTED_MOE_MODELS`` is empty, or when neither ``model`` nor its inner
    model is an instance of a registered class.

    Args:
        model: The model object to patch in place. If it has a ``runnable``
            attribute and its type name contains ``ACLGraphWrapper``, the
            wrapped ``model.runnable`` is patched instead.

    Returns:
        None. Parameters are modified in place.

    Raises:
        ValueError: If the (unwrapped) model has neither a ``model`` nor a
            ``language_model`` attribute (or both are ``None``).
    """
    if not SUPPORTED_MOE_MODELS:
        return

    # Unwrap graph wrappers so the checks below see the real model object.
    if hasattr(model, "runnable") and "ACLGraphWrapper" in str(type(model)):
        model = model.runnable

    # Model classes whose per-layer MoE module is not stored under "mlp".
    MLP_ATTR_MAPPING = {}
    try:
        from vllm.model_executor.models.mixtral import MixtralForCausalLM
    except ImportError:
        pass
    else:
        MLP_ATTR_MAPPING[MixtralForCausalLM] = "block_sparse_moe"

    DEFAULT_MLP_ATTR = "mlp"

    # The layer stack lives on either `model.model` or `model.language_model`.
    inner_model = getattr(model, "model", None)
    if inner_model is None:
        inner_model = getattr(model, "language_model", None)
    if inner_model is None:
        raise ValueError("The provided model does not have a valid 'model' or 'language_model' attribute.")

    supported_types = tuple(SUPPORTED_MOE_MODELS)
    if not isinstance(model, supported_types) and not isinstance(inner_model, supported_types):
        return

    # Matched by class name rather than isinstance because the class may not
    # be importable here; for it, the layer stack sits one level deeper.
    if type(inner_model).__name__ == "Qwen3MoeLLMForCausalLM":
        inner_model = inner_model.model

    # Resolve the attribute name once. isinstance keeps this consistent with
    # the support check above, so subclasses of a mapped class are handled
    # the same way as the class itself.
    mlp_attr = next(
        (attr for model_cls, attr in MLP_ATTR_MAPPING.items() if isinstance(model, model_cls)),
        DEFAULT_MLP_ATTR,
    )

    for layer in inner_model.layers:
        mlp = getattr(layer, mlp_attr, None)
        if mlp is None:
            continue

        experts = getattr(mlp, "experts", None)
        if experts is None or not hasattr(experts, "weight_loader"):
            continue

        weight_loader = experts.weight_loader
        for name, param in mlp.named_parameters():
            if "w13_weight" in name or "w2_weight" in name:
                param.weight_loader = weight_loader
|
|