KinetoLabs Claude Opus 4.5 committed on
Commit 71a896b · 1 Parent(s): 3c9a722

Fix vLLM multi-GPU init: explicit dtype + higher mem util + eager mode


Based on a successful RTX 4090D config from HF discussions:
- Force env vars with os.environ[] (not setdefault); see the sketch below
- Add dtype="float16" explicitly
- Increase gpu_memory_utilization to 0.90
- Add enforce_eager=True for multi-GPU stability

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
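
Context for the setdefault change: os.environ.setdefault() is a no-op whenever the variable is already exported by the hosting environment, so the previous code could silently keep a pre-set VLLM_USE_V1. A minimal illustration (the pre-set value here is hypothetical, standing in for whatever the launcher exports):

    import os

    os.environ["VLLM_USE_V1"] = "1"            # hypothetical value pre-set by the launcher
    os.environ.setdefault("VLLM_USE_V1", "0")  # no-op: the key already exists
    print(os.environ["VLLM_USE_V1"])           # prints "1" - V0 was never forced

    os.environ["VLLM_USE_V1"] = "0"            # direct assignment always overrides
    print(os.environ["VLLM_USE_V1"])           # prints "0"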

Files changed (1)
models/real.py +11 -9
models/real.py CHANGED
@@ -15,17 +15,18 @@ Model Loading:
 import os
 
 # vLLM environment variables - MUST be set before importing vLLM
-# Force V0 engine (more stable than V1 for multi-GPU)
-os.environ.setdefault("VLLM_USE_V1", "0")
+# Force values (not setdefault) to override any pre-existing
+# Force V0 engine (V1 has multi-GPU initialization issues)
+os.environ["VLLM_USE_V1"] = "0"
 
-# Fix for "Engine core initialization failed" with tensor parallelism
+# Force spawn method for tensor parallelism workers
 # See: https://github.com/vllm-project/vllm/issues/17618
-os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-# NCCL settings for multi-GPU reliability
+# NCCL settings for L4 GPU communication
 # See: https://github.com/vllm-project/vllm/issues/19002
-os.environ.setdefault("NCCL_P2P_DISABLE", "1")
-os.environ.setdefault("NCCL_IB_DISABLE", "1")
+os.environ["NCCL_P2P_DISABLE"] = "1"
+os.environ["NCCL_IB_DISABLE"] = "1"
 
 import json
 import logging
@@ -90,11 +91,12 @@ class RealModelStack:
 
         self.models["vision"] = LLM(
             model=settings.vision_model,
-            # FP8 quantization is built into model weights, no quantization param needed
             tensor_parallel_size=settings.vllm_tensor_parallel_size,
             trust_remote_code=True,
-            gpu_memory_utilization=0.70,  # Per Qwen FP8 model recommendations
+            dtype="float16",  # Explicit dtype (RTX 4090D success config)
+            gpu_memory_utilization=0.90,  # Higher utilization for FP8 model
             max_model_len=settings.vllm_max_model_len,
+            enforce_eager=True,  # Disable CUDA graphs for multi-GPU stability
         )
 
         # Load processor for chat template formatting
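
Taken together, the new configuration reduces to the standalone sketch below. The model name, tensor-parallel size, and context length are placeholders standing in for the repo's settings values, not the actual configuration; the environment variables must be assigned before the first vllm import.

    import os

    # Must run before vLLM is imported
    os.environ["VLLM_USE_V1"] = "0"
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
    os.environ["NCCL_P2P_DISABLE"] = "1"
    os.environ["NCCL_IB_DISABLE"] = "1"

    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder for settings.vision_model
        tensor_parallel_size=2,             # placeholder for settings.vllm_tensor_parallel_size
        trust_remote_code=True,
        dtype="float16",                    # explicit dtype
        gpu_memory_utilization=0.90,        # leave ~10% headroom per GPU
        max_model_len=8192,                 # placeholder for settings.vllm_max_model_len
        enforce_eager=True,                 # skip CUDA graph capture
    )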