Spaces:
Sleeping
Sleeping
Commit
·
8fd14bc
1
Parent(s):
5ee455a
Adjust CUDA handling and set explicit device for vLLM
Browse files
app.py
CHANGED
|
@@ -14,17 +14,14 @@ from threading import Thread
|
|
| 14 |
# Enable optimizations
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
# vLLM needs explicit CUDA device configuration
|
| 19 |
-
# ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
|
| 20 |
if torch.cuda.is_available():
|
| 21 |
-
# Set CUDA_VISIBLE_DEVICES if not already set or if it's a MIG UUID
|
| 22 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 23 |
-
if not cuda_visible:
|
| 24 |
-
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 25 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
|
|
|
| 26 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 27 |
-
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 28 |
else:
|
| 29 |
print("WARNING: CUDA not available - vLLM will not work")
|
| 30 |
|
|
@@ -213,32 +210,31 @@ def load_vllm_model(model_name: str):
|
|
| 213 |
}
|
| 214 |
|
| 215 |
# Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
|
| 216 |
-
# ZeroGPU
|
| 217 |
-
# IMPORTANT: Set this BEFORE creating LLM() instance, as vLLM checks device during init
|
| 218 |
-
# Also need to ensure torch sees the change immediately
|
| 219 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 220 |
-
if not cuda_visible:
|
| 221 |
-
# If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
|
| 222 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
# This ensures vLLM sees the correct device
|
| 226 |
try:
|
| 227 |
if hasattr(torch.cuda, '_lazy_init'):
|
| 228 |
torch.cuda._lazy_init()
|
| 229 |
except Exception:
|
| 230 |
pass
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
| 234 |
if torch.cuda.is_available():
|
| 235 |
-
# Verify device is accessible
|
| 236 |
device_name = torch.cuda.get_device_name(0)
|
| 237 |
print(f" → Verified CUDA device accessible: {device_name}")
|
| 238 |
-
# Explicitly set default device to ensure vLLM can detect it
|
| 239 |
torch.cuda.set_device(0)
|
| 240 |
-
print(
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 243 |
if quantization == "awq":
|
| 244 |
llm_kwargs["quantization"] = "awq"
|
|
@@ -265,6 +261,9 @@ def load_vllm_model(model_name: str):
|
|
| 265 |
except Exception:
|
| 266 |
print(f" → FP8 quantization not available, falling back to bf16")
|
| 267 |
|
|
|
|
|
|
|
|
|
|
| 268 |
print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
|
| 269 |
llm = LLM(**llm_kwargs)
|
| 270 |
VLLM_MODELS[model_name] = llm
|
|
|
|
| 14 |
# Enable optimizations
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
| 17 |
+
# ZeroGPU often exposes MIG UUIDs; keep them unless the variable is empty
|
|
|
|
|
|
|
| 18 |
if torch.cuda.is_available():
|
|
|
|
| 19 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 20 |
+
if not cuda_visible:
|
|
|
|
| 21 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 22 |
+
cuda_visible = "0"
|
| 23 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 24 |
+
print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 25 |
else:
|
| 26 |
print("WARNING: CUDA not available - vLLM will not work")
|
| 27 |
|
|
|
|
| 210 |
}
|
| 211 |
|
| 212 |
# Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
|
| 213 |
+
# ZeroGPU exposes MIG UUIDs; keep them unless the variable is empty
|
|
|
|
|
|
|
| 214 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 215 |
+
if not cuda_visible:
|
|
|
|
| 216 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 217 |
+
cuda_visible = "0"
|
| 218 |
+
print(" → CUDA_VISIBLE_DEVICES was empty, set to 0")
|
|
|
|
| 219 |
try:
|
| 220 |
if hasattr(torch.cuda, '_lazy_init'):
|
| 221 |
torch.cuda._lazy_init()
|
| 222 |
except Exception:
|
| 223 |
pass
|
| 224 |
+
else:
|
| 225 |
+
print(f" → CUDA_VISIBLE_DEVICES retained: {cuda_visible}")
|
| 226 |
+
|
| 227 |
+
# Force torch to see the correct device after ensuring CUDA_VISIBLE_DEVICES
|
| 228 |
if torch.cuda.is_available():
|
|
|
|
| 229 |
device_name = torch.cuda.get_device_name(0)
|
| 230 |
print(f" → Verified CUDA device accessible: {device_name}")
|
|
|
|
| 231 |
torch.cuda.set_device(0)
|
| 232 |
+
print(" → Set torch.cuda default device to 0")
|
| 233 |
+
|
| 234 |
+
# Disable Ray executor on ZeroGPU to simplify device handling
|
| 235 |
+
os.environ.setdefault("VLLM_USE_RAY", "0")
|
| 236 |
+
os.environ.setdefault("VLLM_WORKER_USE_RAY", "0")
|
| 237 |
+
|
| 238 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 239 |
if quantization == "awq":
|
| 240 |
llm_kwargs["quantization"] = "awq"
|
|
|
|
| 261 |
except Exception:
|
| 262 |
print(f" → FP8 quantization not available, falling back to bf16")
|
| 263 |
|
| 264 |
+
# Explicitly select CUDA device and single-process executor
|
| 265 |
+
llm_kwargs["device"] = "cuda" if torch.cuda.is_available() else "cpu"
|
| 266 |
+
|
| 267 |
print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
|
| 268 |
llm = LLM(**llm_kwargs)
|
| 269 |
VLLM_MODELS[model_name] = llm
|