Spaces:
Sleeping
Sleeping
Commit
·
63c8de5
1
Parent(s):
f886036
Disable vLLM by default on MIG devices
Browse files
app.py
CHANGED
|
@@ -15,11 +15,15 @@ from threading import Thread
|
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
| 17 |
# ZeroGPU often exposes MIG UUIDs in CUDA_VISIBLE_DEVICES; keep the value as-is and only default it to "0" when it is empty
|
|
|
|
| 18 |
if torch.cuda.is_available():
|
| 19 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 20 |
if not cuda_visible:
|
| 21 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 22 |
cuda_visible = "0"
|
|
|
|
|
|
|
|
|
|
| 23 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 24 |
print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 25 |
else:
|
|
@@ -36,6 +40,9 @@ except ImportError:
|
|
| 36 |
SamplingParams = None
|
| 37 |
print("Warning: vLLM not available, falling back to Transformers")
|
| 38 |
|
|
|
|
|
|
|
|
|
|
| 39 |
# Try to import LLM Compressor (for quantization - optional, vLLM has native AWQ support)
|
| 40 |
# Note: llm-compressor is only needed for quantizing models, not for loading pre-quantized AWQ models
|
| 41 |
# vLLM can load AWQ models natively without llm-compressor
|
|
@@ -329,7 +336,7 @@ def load_pipeline(model_name: str):
|
|
| 329 |
"""
|
| 330 |
# Try vLLM first (best performance; vLLM loads pre-quantized AWQ models natively, no llm-compressor needed)
|
| 331 |
# vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
|
| 332 |
-
if VLLM_AVAILABLE:
|
| 333 |
try:
|
| 334 |
print(f"🔄 Attempting to load {model_name} with vLLM (native AWQ support)...")
|
| 335 |
return load_vllm_model(model_name)
|
|
@@ -344,6 +351,9 @@ def load_pipeline(model_name: str):
|
|
| 344 |
print(f"✅ Using cached Transformers pipeline for {model_name}")
|
| 345 |
return PIPELINES[model_name]
|
| 346 |
|
|
|
|
|
|
|
|
|
|
| 347 |
model_config = MODELS[model_name]
|
| 348 |
repo = model_config["repo_id"]
|
| 349 |
tokenizer_repo = model_config.get("tokenizer_repo", None)
|
|
|
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
| 17 |
# ZeroGPU often exposes MIG UUIDs in CUDA_VISIBLE_DEVICES; keep the value as-is and only default it to "0" when it is empty
|
| 18 |
+
MIG_VISIBLE = False
|
| 19 |
if torch.cuda.is_available():
|
| 20 |
cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
|
| 21 |
if not cuda_visible:
|
| 22 |
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 23 |
cuda_visible = "0"
|
| 24 |
+
print("CUDA_VISIBLE_DEVICES was empty -> set to 0")
|
| 25 |
+
elif cuda_visible.startswith("MIG"):
|
| 26 |
+
MIG_VISIBLE = True
|
| 27 |
print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
|
| 28 |
print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 29 |
else:
|
|
|
|
| 40 |
SamplingParams = None
|
| 41 |
print("Warning: vLLM not available, falling back to Transformers")
|
| 42 |
|
| 43 |
+
# Optional DISABLE_VLLM env flag ("1" disables vLLM); defaults to "1" when a MIG device is visible, due to device detection instability
|
| 44 |
+
DISABLE_VLLM = os.environ.get("DISABLE_VLLM", "1" if MIG_VISIBLE else "0") == "1"
|
| 45 |
+
|
| 46 |
# Try to import LLM Compressor (for quantization - optional, vLLM has native AWQ support)
|
| 47 |
# Note: llm-compressor is only needed for quantizing models, not for loading pre-quantized AWQ models
|
| 48 |
# vLLM can load AWQ models natively without llm-compressor
|
|
|
|
| 336 |
"""
|
| 337 |
# Try vLLM first (best performance; vLLM loads pre-quantized AWQ models natively, no llm-compressor needed)
|
| 338 |
# vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
|
| 339 |
+
if VLLM_AVAILABLE and not DISABLE_VLLM:
|
| 340 |
try:
|
| 341 |
print(f"🔄 Attempting to load {model_name} with vLLM (native AWQ support)...")
|
| 342 |
return load_vllm_model(model_name)
|
|
|
|
| 351 |
print(f"✅ Using cached Transformers pipeline for {model_name}")
|
| 352 |
return PIPELINES[model_name]
|
| 353 |
|
| 354 |
+
if DISABLE_VLLM and VLLM_AVAILABLE:
|
| 355 |
+
print("⚠️ vLLM disabled for this deployment (DISABLE_VLLM=1 or MIG device detected)")
|
| 356 |
+
|
| 357 |
model_config = MODELS[model_name]
|
| 358 |
repo = model_config["repo_id"]
|
| 359 |
tokenizer_repo = model_config.get("tokenizer_repo", None)
|