Spaces:
Sleeping
Sleeping
Commit
·
2ddfeca
1
Parent(s):
b4fd5e9
Fix vLLM device detection for ZeroGPU
Browse files
- Set CUDA_VISIBLE_DEVICES environment variable for vLLM
- Add CUDA detection logging at module level
- Ensure CUDA is available before attempting vLLM load
- Improve error messages for device detection issues
app.py
CHANGED
|
@@ -14,6 +14,17 @@ from threading import Thread
|
|
| 14 |
# Enable optimizations
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Try to import vLLM (primary inference engine)
|
| 18 |
try:
|
| 19 |
from vllm import LLM, SamplingParams
|
|
@@ -136,9 +147,19 @@ def load_vllm_model(model_name: str):
|
|
| 136 |
print(f"Loading {repo} with vLLM (quantization: {quantization})...")
|
| 137 |
|
| 138 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# vLLM configuration optimized for ZeroGPU H200 slice
|
| 140 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 141 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
|
|
|
| 142 |
llm_kwargs = {
|
| 143 |
"model": repo,
|
| 144 |
"trust_remote_code": True,
|
|
@@ -151,6 +172,10 @@ def load_vllm_model(model_name: str):
|
|
| 151 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 152 |
}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 155 |
if quantization == "awq":
|
| 156 |
llm_kwargs["quantization"] = "awq"
|
|
|
|
# Enable optimizations
torch.backends.cuda.matmul.allow_tf32 = True

# Ensure CUDA is visible to vLLM on ZeroGPU.
# NOTE(review): on ZeroGPU, torch.cuda.is_available() is typically False at
# module import time — the GPU slice is attached only inside @spaces.GPU
# calls. The original code set CUDA_VISIBLE_DEVICES inside the
# is_available() guard, so in exactly the environment this commit targets
# the variable was never set. Set it unconditionally (but never clobber an
# existing value, which ZeroGPU itself may manage) so vLLM's device
# detection works once the GPU appears.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    # GPU already attached (non-ZeroGPU hardware, or a warm context).
    print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
else:
    # Expected at import time on ZeroGPU; the device appears later inside
    # @spaces.GPU-decorated functions.
    print("WARNING: CUDA not available - vLLM will not work")
| 28 |
# Try to import vLLM (primary inference engine)
|
| 29 |
try:
|
| 30 |
from vllm import LLM, SamplingParams
|
|
|
|
| 147 |
print(f"Loading {repo} with vLLM (quantization: {quantization})...")
|
| 148 |
|
| 149 |
try:
|
| 150 |
+
# Detect device explicitly for vLLM
|
| 151 |
+
# vLLM needs explicit device configuration on ZeroGPU
|
| 152 |
+
if not torch.cuda.is_available():
|
| 153 |
+
raise RuntimeError("CUDA not available - vLLM requires GPU. Falling back to Transformers pipeline.")
|
| 154 |
+
|
| 155 |
+
print(f" → CUDA available: {torch.cuda.get_device_name(0)}")
|
| 156 |
+
print(f" → CUDA device count: {torch.cuda.device_count()}")
|
| 157 |
+
print(f" → CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 158 |
+
|
| 159 |
# vLLM configuration optimized for ZeroGPU H200 slice
|
| 160 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 161 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
| 162 |
+
# vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
|
| 163 |
llm_kwargs = {
|
| 164 |
"model": repo,
|
| 165 |
"trust_remote_code": True,
|
|
|
|
| 172 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 173 |
}
|
| 174 |
|
| 175 |
+
# Ensure CUDA_VISIBLE_DEVICES is set for vLLM device detection
|
| 176 |
+
if "CUDA_VISIBLE_DEVICES" not in os.environ:
|
| 177 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 178 |
+
|
| 179 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 180 |
if quantization == "awq":
|
| 181 |
llm_kwargs["quantization"] = "awq"
|