JustinTX's picture
Add files using upload-large-folder tool
d7b3a74 verified
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index bdb124e51..3edf30ab1 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -454,14 +454,14 @@ class ModelConfig:
).lower()
# Detect which checkpoint is it
- for _, method in QUANTIZATION_METHODS.items():
- quantization_override = method.override_quantization_method(
- quant_cfg, self.quantization
- )
- if quantization_override:
- quant_method = quantization_override
- self.quantization = quantization_override
- break
+ # for _, method in QUANTIZATION_METHODS.items():
+ # quantization_override = method.override_quantization_method(
+ # quant_cfg, self.quantization
+ # )
+ # if quantization_override:
+ # quant_method = quantization_override
+ # self.quantization = quantization_override
+ # break
# Verify quantization configurations.
if self.quantization is None:
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
index 2dd2c75f1..f2adb18f8 100644
--- a/python/sglang/srt/entrypoints/http_server.py
+++ b/python/sglang/srt/entrypoints/http_server.py
@@ -264,6 +264,10 @@ async def validate_json_request(raw_request: Request):
@app.get("/health")
+async def health(request: Request) -> Response:
+ return Response(status_code=200)
+
+
@app.get("/health_generate")
async def health_generate(request: Request) -> Response:
"""
diff --git a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
index 372717bf9..40665cc90 100644
--- a/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
+++ b/python/sglang/srt/layers/moe/token_dispatcher/deepep.py
@@ -190,6 +190,7 @@ class DeepEPBuffer:
f"Consider using --deepep-config to change the behavior."
)
+ num_qps_per_rank = 20
cls._buffer = Buffer(
group,
num_nvl_bytes,
diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
index 956264fc9..69f729336 100644
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -351,10 +351,10 @@ class Fp8LinearMethod(LinearMethodBase):
return
else:
weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
- layer.weight = torch.nn.Parameter(weight, requires_grad=False)
- layer.weight_scale_inv = torch.nn.Parameter(
- weight_scale, requires_grad=False
- )
+ # layer.weight = torch.nn.Parameter(weight, requires_grad=False)
+ # layer.weight_scale_inv = torch.nn.Parameter(
+ # weight_scale, requires_grad=False
+ # )
return
layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
index 95a529c89..758fbfd5f 100644
--- a/python/sglang/srt/managers/scheduler.py
+++ b/python/sglang/srt/managers/scheduler.py
@@ -1359,7 +1359,7 @@ class Scheduler(
if memory_leak:
msg = "token_to_kv_pool_allocator memory leak detected! " f"{token_msg}"
- raise ValueError(msg)
+ # raise ValueError(msg)
if self.disaggregation_mode == DisaggregationMode.DECODE:
req_total_size = (
@@ -1374,7 +1374,7 @@ class Scheduler(
f"available_size={len(self.req_to_token_pool.free_slots)}, "
f"total_size={self.req_to_token_pool.size}\n"
)
- raise ValueError(msg)
+ # raise ValueError(msg)
if (
self.enable_metrics
@@ -1830,6 +1830,7 @@ class Scheduler(
deepep_mode=DeepEPMode(self.server_args.deepep_mode),
require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
disable_overlap_schedule=self.server_args.disable_overlap_schedule,
+ offload_tags=self.offload_tags,
)
def handle_dp_balance_data(self, local_batch: ScheduleBatch):
@@ -1927,6 +1928,7 @@ class Scheduler(
deepep_mode: DeepEPMode,
require_mlp_tp_gather: bool,
disable_overlap_schedule: bool,
+ offload_tags: set[str],
):
# Check if other DP workers have running batches
if local_batch is None:
@@ -1957,7 +1959,7 @@ class Scheduler(
)
tbo_preparer = TboDPAttentionPreparer()
- if disable_overlap_schedule:
+ if len(offload_tags) == 0 and disable_overlap_schedule:
group = tp_group.device_group
device = tp_group.device
else:
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 58220b1d6..3c3d081a8 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -1044,10 +1044,15 @@ class TokenizerManager:
request: Optional[fastapi.Request] = None,
) -> Tuple[bool, str]:
self.auto_create_handle_loop()
- assert (
- self.server_args.dp_size == 1
- ), "dp_size must be 1 for init parameter update group"
- result = (await self.init_weights_update_group_communicator(obj))[0]
+ results = await self.init_weights_update_group_communicator(obj)
+ if self.server_args.dp_size == 1:
+ result = results[0]
+ return result.success, result.message
+ else:
+ all_success = all([r.success for r in results])
+ all_message = [r.message for r in results]
+ all_message = " | ".join(all_message)
+ return all_success, all_message
return result.success, result.message
async def update_weights_from_distributed(
@@ -1056,9 +1061,6 @@ class TokenizerManager:
request: Optional[fastapi.Request] = None,
) -> Tuple[bool, str]:
self.auto_create_handle_loop()
- assert (
- self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
- ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
if obj.abort_all_requests:
self.abort_request(abort_all=True)
@@ -1066,8 +1068,15 @@ class TokenizerManager:
# This means that weight sync
# cannot run while requests are in progress.
async with self.model_update_lock.writer_lock:
- result = (await self.update_weights_from_distributed_communicator(obj))[0]
- return result.success, result.message
+ results = await self.update_weights_from_distributed_communicator(obj)
+ if self.server_args.dp_size == 1:
+ result = results[0]
+ return result.success, result.message
+ else:
+ all_success = all([r.success for r in results])
+ all_message = [r.message for r in results]
+ all_message = " | ".join(all_message)
+ return all_success, all_message
async def update_weights_from_tensor(
self,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 5222bff0a..ff0bbc62a 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -22,6 +22,7 @@ import os
import time
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
+from contextlib import nullcontext
import torch
import torch.distributed as dist
@@ -675,7 +676,7 @@ class ModelRunner:
monkey_patch_vllm_parallel_state()
monkey_patch_isinstance_for_vllm_base_layer()
- with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS):
+ with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_WEIGHTS) if not self.is_draft_worker else nullcontext():
self.model = get_model(
model_config=self.model_config,
load_config=self.load_config,
diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py
index e0f0b373d..a18ac10f1 100644
--- a/python/sglang/srt/models/glm4_moe.py
+++ b/python/sglang/srt/models/glm4_moe.py
@@ -1108,5 +1108,4 @@ class Glm4MoeForCausalLM(DeepseekV2ForCausalLM):
)
weight_loader(param, loaded_weight)
-
EntryClass = [Glm4MoeForCausalLM]