SGLang startup errors

#1
by fpjnijweide - opened

Hi,

I'm trying to run this with SGLang (the latest glm5-hopper Docker container), but I'm getting the following error:

sglang | [2026-02-12 13:09:24 TP0] Scheduler hit an exception: Traceback (most recent call last):
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 3039, in run_scheduler_process
sglang | scheduler = Scheduler(
sglang | ^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 352, in init
sglang | self.init_model_worker()
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 541, in init_model_worker
sglang | self.init_tp_model_worker()
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/scheduler.py", line 503, in init_tp_model_worker
sglang | self.tp_worker = TpModelWorker(
sglang | ^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 242, in init
sglang | self._init_model_runner()
sglang | File "/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 325, in _init_model_runner
sglang | self._model_runner = ModelRunner(
sglang | ^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 403, in init
sglang | self.initialize(min_per_gpu_memory)
sglang | File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 483, in initialize
sglang | self.load_model()
sglang | File "/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 961, in load_model
sglang | self.model = self.loader.load_model(
sglang | ^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 662, in load_model
sglang | model = _initialize_model(
sglang | ^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/model_loader/loader.py", line 276, in _initialize_model
sglang | return model_class(**kwargs)
sglang | ^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/models/deepseek_v2.py", line 2799, in init
sglang | self.model = DeepseekV2Model(
sglang | ^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/models/deepseek_v2.py", line 2534, in init
sglang | self.layers, self.start_layer, self.end_layer = make_layers(
sglang | ^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/utils/common.py", line 647, in make_layers
sglang | + get_offloader().wrap_modules(
sglang | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/utils/offloader.py", line 36, in wrap_modules
sglang | return list(all_modules_generator)
sglang | ^^^^^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/utils/common.py", line 649, in
sglang | layer_fn(idx=idx, prefix=add_prefix(idx, prefix))
sglang | File "/sgl-workspace/sglang/python/sglang/srt/models/deepseek_v2.py", line 2536, in
sglang | lambda idx, prefix: DeepseekV2DecoderLayer(
sglang | ^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/models/deepseek_v2.py", line 2248, in init
sglang | self.self_attn = DeepseekV2AttentionMLA(
sglang | ^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/sgl-workspace/sglang/python/sglang/srt/models/deepseek_v2.py", line 1280, in init
sglang | and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.bfloat16
sglang | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sglang | File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1964, in getattr
sglang | raise AttributeError(
sglang | AttributeError: 'ReplicatedLinear' object has no attribute 'weight'. Did you mean: 'qweight'?
sglang |
sglang | [2026-02-12 13:09:24] Received sigquit from a child process. It usually means the child failed.
sglang exited with code 0 (restarting)

Any ideas? Thanks :)

Here is the example Docker Compose file I'm using:

services:
  sglang:
    image: lmsysorg/sglang:glm5-hopper
    container_name: sglang
    restart: unless-stopped
    ports:
      - "8000:8000"
    entrypoint: python3 -m sglang.launch_server
    command: >
                --model-path INC4AI/GLM-5-int4-mixed-AutoRound
                --host 0.0.0.0
                --port 8000
                --trust-remote-code
                --tp-size 4
                --served-model-name DVGPT
                --mem-fraction-static 0.925
                --tool-call-parser glm47
                --reasoning-parser glm45
                --enable-metrics
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
    privileged: true
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

Unfortunately, vLLM does not work either — it fails with a missing-weight error during loading:

vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783] WorkerProc failed to start.
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783] Traceback (most recent call last):
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 754, in worker_main
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     worker = WorkerProc(*args, **kwargs)
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 580, in __init__
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     self.worker.load_model()
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 294, in load_model
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     self.model_runner.load_model(eep_scale_up=eep_scale_up)
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4143, in load_model
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     self.model = model_loader.load_model(
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]                  ^^^^^^^^^^^^^^^^^^^^^^^^
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 62, in load_model
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     self.load_weights(model, model_config)
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/default_loader.py", line 290, in load_weights
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     loaded_weights = model.load_weights(self.get_all_weights(model_config, model))
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 1483, in load_weights
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]     param = params_dict[name]
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783]             ~~~~~~~~~~~^^^^^^
vllm  | (Worker_TP3 pid=981) ERROR 02-12 17:29:59 [multiproc_executor.py:783] KeyError: 'model.layers.0.self_attn.weights_proj.qweight'

Sign up or log in to comment