Alikestocode committed on
Commit
63c8de5
·
1 Parent(s): f886036

Disable vLLM by default on MIG devices

Browse files
Files changed (1) hide show
  1. app.py +11 -1
app.py CHANGED
@@ -15,11 +15,15 @@ from threading import Thread
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
  # ZeroGPU often exposes MIG UUIDs; keep them unless the variable is empty
 
18
  if torch.cuda.is_available():
19
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
20
  if not cuda_visible:
21
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
22
  cuda_visible = "0"
 
 
 
23
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
24
  print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
25
  else:
@@ -36,6 +40,9 @@ except ImportError:
36
  SamplingParams = None
37
  print("Warning: vLLM not available, falling back to Transformers")
38
 
 
 
 
39
  # Try to import LLM Compressor (for quantization - optional, vLLM has native AWQ support)
40
  # Note: llm-compressor is only needed for quantizing models, not for loading pre-quantized AWQ models
41
  # vLLM can load AWQ models natively without llm-compressor
@@ -329,7 +336,7 @@ def load_pipeline(model_name: str):
329
  """
330
  # Try vLLM first (best performance with native AWQ support via llm-compressor)
331
  # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
332
- if VLLM_AVAILABLE:
333
  try:
334
  print(f"🔄 Attempting to load {model_name} with vLLM (native AWQ support)...")
335
  return load_vllm_model(model_name)
@@ -344,6 +351,9 @@ def load_pipeline(model_name: str):
344
  print(f"✅ Using cached Transformers pipeline for {model_name}")
345
  return PIPELINES[model_name]
346
 
 
 
 
347
  model_config = MODELS[model_name]
348
  repo = model_config["repo_id"]
349
  tokenizer_repo = model_config.get("tokenizer_repo", None)
 
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
  # ZeroGPU often exposes MIG UUIDs; keep them unless the variable is empty
18
+ MIG_VISIBLE = False
19
  if torch.cuda.is_available():
20
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
21
  if not cuda_visible:
22
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
23
  cuda_visible = "0"
24
+ print("CUDA_VISIBLE_DEVICES was empty -> set to 0")
25
+ elif cuda_visible.startswith("MIG"):
26
+ MIG_VISIBLE = True
27
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
28
  print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
29
  else:
 
40
  SamplingParams = None
41
  print("Warning: vLLM not available, falling back to Transformers")
42
 
43
+ # Optional flag to disable vLLM (defaults to true on MIG due to device detection instability)
44
+ DISABLE_VLLM = os.environ.get("DISABLE_VLLM", "1" if MIG_VISIBLE else "0") == "1"
45
+
46
  # Try to import LLM Compressor (for quantization - optional, vLLM has native AWQ support)
47
  # Note: llm-compressor is only needed for quantizing models, not for loading pre-quantized AWQ models
48
  # vLLM can load AWQ models natively without llm-compressor
 
336
  """
337
  # Try vLLM first (best performance with native AWQ support via llm-compressor)
338
  # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
339
+ if VLLM_AVAILABLE and not DISABLE_VLLM:
340
  try:
341
  print(f"🔄 Attempting to load {model_name} with vLLM (native AWQ support)...")
342
  return load_vllm_model(model_name)
 
351
  print(f"✅ Using cached Transformers pipeline for {model_name}")
352
  return PIPELINES[model_name]
353
 
354
+ if DISABLE_VLLM and VLLM_AVAILABLE:
355
+ print("⚠️ vLLM disabled for this deployment (DISABLE_VLLM=1 or MIG device detected)")
356
+
357
  model_config = MODELS[model_name]
358
  repo = model_config["repo_id"]
359
  tokenizer_repo = model_config.get("tokenizer_repo", None)