Commit 71a896b
Parent: 3c9a722
Fix vLLM multi-GPU init: explicit dtype + higher mem util + eager mode
Based on successful RTX 4090D config from HF discussions:
- Force env vars with os.environ[] (not setdefault); see the sketch below
- Add dtype="float16" explicitly
- Increase gpu_memory_utilization to 0.90
- Add enforce_eager=True for multi-GPU stability
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
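
The first bullet is the heart of the fix: os.environ.setdefault() is a no-op when the variable already exists in the environment (for example, set by the Space runtime before this module loads), while plain item assignment always overwrites. A minimal sketch of the difference, using one of the variables from this commit:

import os

# Suppose the hosting runtime already exported this before our code runs.
os.environ["VLLM_USE_V1"] = "1"

os.environ.setdefault("VLLM_USE_V1", "0")   # keeps the pre-existing "1"
print(os.environ["VLLM_USE_V1"])            # -> 1

os.environ["VLLM_USE_V1"] = "0"             # unconditional override
print(os.environ["VLLM_USE_V1"])            # -> 0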
models/real.py  CHANGED  (+11 -9)
@@ -15,17 +15,18 @@ Model Loading:
 import os
 
 # vLLM environment variables - MUST be set before importing vLLM
-# Force V0 engine (V1 has multi-GPU initialization issues)
-os.environ.setdefault("VLLM_USE_V1", "0")
+# Force values (not setdefault) to override any pre-existing
+# Force V0 engine (V1 has multi-GPU initialization issues)
+os.environ["VLLM_USE_V1"] = "0"
 
-# Force spawn method for tensor parallelism workers
+# Force spawn method for tensor parallelism workers
 # See: https://github.com/vllm-project/vllm/issues/17618
-os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-# NCCL settings for L4 GPU communication
+# NCCL settings for L4 GPU communication
 # See: https://github.com/vllm-project/vllm/issues/19002
-os.environ.setdefault("NCCL_P2P_DISABLE", "1")
-os.environ.setdefault("NCCL_IB_DISABLE", "1")
+os.environ["NCCL_P2P_DISABLE"] = "1"
+os.environ["NCCL_IB_DISABLE"] = "1"
 
 import json
 import logging

@@ -90,11 +91,12 @@ class RealModelStack:
 
         self.models["vision"] = LLM(
             model=settings.vision_model,
-            # FP8 quantization is built into model weights, no quantization param needed
             tensor_parallel_size=settings.vllm_tensor_parallel_size,
             trust_remote_code=True,
-            gpu_memory_utilization=…,
+            dtype="float16",  # Explicit dtype (RTX 4090D success config)
+            gpu_memory_utilization=0.90,  # Higher utilization for FP8 model
             max_model_len=settings.vllm_max_model_len,
+            enforce_eager=True,  # Disable CUDA graphs for multi-GPU stability
         )
 
         # Load processor for chat template formatting
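
For reference, a consolidated, runnable sketch of how the touched code reads after this commit. Only the environment-variable preamble and the LLM(...) keyword arguments come from the diff above; the _Settings stub and the model id are placeholders standing in for the Space's real settings module:

import os

# Must run before vLLM is imported - these are read at import/engine-init time.
os.environ["VLLM_USE_V1"] = "0"                       # V0 engine (V1 has multi-GPU init issues)
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # spawn workers for tensor parallelism
os.environ["NCCL_P2P_DISABLE"] = "1"                  # NCCL settings for L4 GPU communication
os.environ["NCCL_IB_DISABLE"] = "1"

from vllm import LLM, SamplingParams

class _Settings:  # placeholder for the real settings object
    vision_model = "org/vision-model-FP8"  # placeholder model id
    vllm_tensor_parallel_size = 2          # assumed two-GPU setup
    vllm_max_model_len = 8192              # assumed context limit

settings = _Settings()

llm = LLM(
    model=settings.vision_model,
    tensor_parallel_size=settings.vllm_tensor_parallel_size,
    trust_remote_code=True,
    dtype="float16",              # explicit dtype (RTX 4090D success config)
    gpu_memory_utilization=0.90,  # higher utilization for the FP8 model
    max_model_len=settings.vllm_max_model_len,
    enforce_eager=True,           # disable CUDA graphs for multi-GPU stability
)

# Smoke test: a short completion confirms both tensor-parallel workers came up.
out = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)

Note the trade-off in enforce_eager=True: skipping CUDA graph capture costs some steady-state throughput but removes a common source of hangs during multi-GPU engine startup.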