KinetoLabs Claude Opus 4.5 committed on
Commit 71a896b · 1 Parent(s): 3c9a722

Fix vLLM multi-GPU init: explicit dtype + higher mem util + eager mode


Based on a successful RTX 4090D config from HF discussions:
- Force env vars with os.environ[] (not setdefault); see the sketch below
- Add dtype="float16" explicitly
- Increase gpu_memory_utilization to 0.90
- Add enforce_eager=True for multi-GPU stability

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
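
Context for the setdefault change: os.environ.setdefault() is a no-op whenever the variable is already exported by the hosting environment, so the previous code could silently keep a pre-set VLLM_USE_V1. A minimal illustration (the pre-set value here is hypothetical, standing in for whatever the launcher exports):

    import os

    os.environ["VLLM_USE_V1"] = "1"            # hypothetical value pre-set by the launcher
    os.environ.setdefault("VLLM_USE_V1", "0")  # no-op: the key already exists
    print(os.environ["VLLM_USE_V1"])           # prints "1" - V0 was never forced

    os.environ["VLLM_USE_V1"] = "0"            # direct assignment always overrides
    print(os.environ["VLLM_USE_V1"])           # prints "0"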

Files changed (1)
models/real.py +11 -9
models/real.py CHANGED
@@ -15,17 +15,18 @@ Model Loading:
 import os
 
 # vLLM environment variables - MUST be set before importing vLLM
-# Force V0 engine (more stable than V1 for multi-GPU)
-os.environ.setdefault("VLLM_USE_V1", "0")
+# Force values (not setdefault) to override any pre-existing
+# Force V0 engine (V1 has multi-GPU initialization issues)
+os.environ["VLLM_USE_V1"] = "0"
 
-# Fix for "Engine core initialization failed" with tensor parallelism
+# Force spawn method for tensor parallelism workers
 # See: https://github.com/vllm-project/vllm/issues/17618
-os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-# NCCL settings for multi-GPU reliability
+# NCCL settings for L4 GPU communication
 # See: https://github.com/vllm-project/vllm/issues/19002
-os.environ.setdefault("NCCL_P2P_DISABLE", "1")
-os.environ.setdefault("NCCL_IB_DISABLE", "1")
+os.environ["NCCL_P2P_DISABLE"] = "1"
+os.environ["NCCL_IB_DISABLE"] = "1"
 
 import json
 import logging
@@ -90,11 +91,12 @@ class RealModelStack:
 
         self.models["vision"] = LLM(
             model=settings.vision_model,
-            # FP8 quantization is built into model weights, no quantization param needed
             tensor_parallel_size=settings.vllm_tensor_parallel_size,
             trust_remote_code=True,
-            gpu_memory_utilization=0.70,  # Per Qwen FP8 model recommendations
+            dtype="float16",  # Explicit dtype (RTX 4090D success config)
+            gpu_memory_utilization=0.90,  # Higher utilization for FP8 model
             max_model_len=settings.vllm_max_model_len,
+            enforce_eager=True,  # Disable CUDA graphs for multi-GPU stability
         )
 
         # Load processor for chat template formatting
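
Taken together, the new configuration reduces to the standalone sketch below. The model name, tensor-parallel size, and context length are placeholders standing in for the repo's settings values, not the actual configuration; the environment variables must be assigned before the first vllm import.

    import os

    # Must run before vLLM is imported
    os.environ["VLLM_USE_V1"] = "0"
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
    os.environ["NCCL_P2P_DISABLE"] = "1"
    os.environ["NCCL_IB_DISABLE"] = "1"

    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder for settings.vision_model
        tensor_parallel_size=2,             # placeholder for settings.vllm_tensor_parallel_size
        trust_remote_code=True,
        dtype="float16",                    # explicit dtype
        gpu_memory_utilization=0.90,        # leave ~10% headroom per GPU
        max_model_len=8192,                 # placeholder for settings.vllm_max_model_len
        enforce_eager=True,                 # skip CUDA graph capture
    )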