File size: 24,129 Bytes
7c50656 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | /workspace/hanrui/sglang/python/sglang/launch_server.py:51: UserWarning: 'python -m sglang.launch_server' is still supported, but 'sglang serve' is the recommended entrypoint.
Example: sglang serve --model-path <model> [options]
warnings.warn(
[2026-03-07 15:24:13] INFO server_args.py:2048: Attention backend not specified. Use fa3 backend by default.
[2026-03-07 15:24:13] WARNING server_args.py:2629: Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests.
[2026-03-07 15:24:13] WARNING server_args.py:2650: Overlap scheduler is disabled when spec v2 is off or using unsupported speculative algorithm. You can set env SGLANG_ENABLE_SPEC_V2=True to enable the experimental overlap scheduler.
[2026-03-07 15:24:13] WARNING server_args.py:2712: speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1
[2026-03-07 15:24:14] server_args=ServerArgs(model_path='/workspace/models/Qwen3-8B', tokenizer_path='/workspace/models/Qwen3-8B', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=True, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='10.233.100.123', port=30000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_keyfile_password=None, enable_ssl_refresh=False, dtype='bfloat16', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.8, max_running_requests=48, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, disable_priority_preemption=False, default_priority_value=None, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=4, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, stream_output=False, enable_streaming_session=False, random_seed=551181117, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, use_ray=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key=None, admin_api_key=None, served_model_name='/workspace/models/Qwen3-8B', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, attention_backend='fa3', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='flashinfer_cutlass', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', speculative_algorithm='STANDALONE', speculative_draft_model_path='/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-dflash-lora-merged', speculative_draft_model_revision='main', speculative_draft_load_format=None, speculative_num_steps=4, speculative_eagle_topk=1, speculative_num_draft_tokens=5, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='auto', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_ngram_min_match_window_size=1, speculative_ngram_max_match_window_size=12, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_branch_length=18, speculative_ngram_capacity=10000000, enable_multi_layer_eagle=False, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, enable_aiter_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, enable_elastic_expert_backup=False, mooncake_ib_device=None, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, linear_attn_backend='triton', linear_attn_decode_backend=None, linear_attn_prefill_backend=None, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', disable_hicache_numa_detect=False, hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, hierarchical_sparse_attention_extra_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', multi_item_scoring_delimiter=None, disable_radix_cache=False, cuda_graph_max_bs=512, cuda_graph_bs=[1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 40, 44, 48, 52, 56, 60, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, disable_custom_all_reduce=False, enable_mscclpp=False, enable_torch_symm_mem=False, disable_overlap_schedule=True, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, disable_piecewise_cuda_graph=True, enforce_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=8192, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_fused_moe_sum_all_reduce=False, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_ib_device=None, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], enable_adaptive_dispatch_to_encoder=False, custom_weight_loader=[], weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, mm_max_concurrent_calls=32, mm_per_request_timeout=10.0, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, enable_mm_global_cache=False, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None)
[2026-03-07 15:24:15] Using default HuggingFace chat template with detected content format: string
[2026-03-07 15:24:25 TP2] Mamba selective_state_update backend initialized: triton
[2026-03-07 15:24:25 TP2] Init torch distributed begin.
[2026-03-07 15:24:26 TP0] Mamba selective_state_update backend initialized: triton
[2026-03-07 15:24:26 TP0] Init torch distributed begin.
[2026-03-07 15:24:26 TP3] Mamba selective_state_update backend initialized: triton
[2026-03-07 15:24:26 TP1] Mamba selective_state_update backend initialized: triton
[2026-03-07 15:24:26 TP3] Init torch distributed begin.
[2026-03-07 15:24:26 TP1] Init torch distributed begin.
[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[2026-03-07 15:24:27 TP0] sglang is using nccl==2.27.5
[2026-03-07 15:24:29 TP0] Scheduler hit an exception: Traceback (most recent call last):
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
self.init_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
self.init_tp_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
self._init_model_runner()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
pre_model_load_memory = self.init_torch_distributed()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
initialize_model_parallel(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
_TP = init_model_parallel_group(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
return GroupCoordinator(
^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
self.pynccl_comm = PyNcclCommunicator(
^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
self.NCCL_CHECK(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
raise RuntimeError(f"NCCL error: {error_str}")
RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
[2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
[2026-03-07 15:24:29 TP2] Scheduler hit an exception: Traceback (most recent call last):
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
self.init_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
self.init_tp_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
self._init_model_runner()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
pre_model_load_memory = self.init_torch_distributed()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
initialize_model_parallel(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
_TP = init_model_parallel_group(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
return GroupCoordinator(
^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
self.pynccl_comm = PyNcclCommunicator(
^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
self.NCCL_CHECK(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
raise RuntimeError(f"NCCL error: {error_str}")
RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
[2026-03-07 15:24:29 TP1] Scheduler hit an exception: Traceback (most recent call last):
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
self.init_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
self.init_tp_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
self._init_model_runner()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
pre_model_load_memory = self.init_torch_distributed()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
initialize_model_parallel(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
_TP = init_model_parallel_group(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
return GroupCoordinator(
^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
self.pynccl_comm = PyNcclCommunicator(
^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
self.NCCL_CHECK(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
raise RuntimeError(f"NCCL error: {error_str}")
RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
[2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
[2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
[2026-03-07 15:24:29 TP3] Scheduler hit an exception: Traceback (most recent call last):
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 3239, in run_scheduler_process
scheduler = Scheduler(
^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 365, in __init__
self.init_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 561, in init_model_worker
self.init_tp_model_worker()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/scheduler.py", line 519, in init_tp_model_worker
self.tp_worker = TpModelWorker(
^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 258, in __init__
self._init_model_runner()
File "/workspace/hanrui/sglang/python/sglang/srt/managers/tp_worker.py", line 341, in _init_model_runner
self._model_runner = ModelRunner(
^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 395, in __init__
pre_model_load_memory = self.init_torch_distributed()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/model_executor/model_runner.py", line 813, in init_torch_distributed
initialize_model_parallel(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1764, in initialize_model_parallel
_TP = init_model_parallel_group(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 1450, in init_model_parallel_group
return GroupCoordinator(
^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/parallel_state.py", line 357, in __init__
self.pynccl_comm = PyNcclCommunicator(
^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py", line 113, in __init__
self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 401, in ncclCommInitRank
self.NCCL_CHECK(
File "/workspace/hanrui/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py", line 376, in NCCL_CHECK
raise RuntimeError(f"NCCL error: {error_str}")
RuntimeError: NCCL error: unhandled system error (run with NCCL_DEBUG=INFO for details)
[2026-03-07 15:24:29] Received sigquit from a child process. It usually means the child failed.
|