Alikestocode committed on
Commit
8fd14bc
·
1 Parent(s): 5ee455a

Adjust CUDA handling and set explicit device for vLLM

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -14,17 +14,14 @@ from threading import Thread
14
  # Enable optimizations
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
- # Ensure CUDA is visible to vLLM on ZeroGPU
18
- # vLLM needs explicit CUDA device configuration
19
- # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
20
  if torch.cuda.is_available():
21
- # Set CUDA_VISIBLE_DEVICES if not already set or if it's a MIG UUID
22
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
23
- if not cuda_visible or not cuda_visible.isdigit():
24
- # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
25
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
26
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
27
- print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
28
  else:
29
  print("WARNING: CUDA not available - vLLM will not work")
30
 
@@ -213,32 +210,31 @@ def load_vllm_model(model_name: str):
213
  }
214
 
215
  # Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
216
- # ZeroGPU uses MIG UUIDs, but vLLM needs numeric device index
217
- # IMPORTANT: Set this BEFORE creating LLM() instance, as vLLM checks device during init
218
- # Also need to ensure torch sees the change immediately
219
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
220
- if not cuda_visible or not cuda_visible.isdigit():
221
- # If CUDA_VISIBLE_DEVICES is a MIG UUID or empty, use "0" for single GPU
222
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
223
- print(f" β†’ Set CUDA_VISIBLE_DEVICES=0 (was: {cuda_visible})")
224
- # Force torch to reinitialize CUDA context after changing CUDA_VISIBLE_DEVICES
225
- # This ensures vLLM sees the correct device
226
  try:
227
  if hasattr(torch.cuda, '_lazy_init'):
228
  torch.cuda._lazy_init()
229
  except Exception:
230
  pass
231
-
232
- # Force torch to see the correct device after setting CUDA_VISIBLE_DEVICES
233
- # This ensures vLLM's device detection works correctly
 
234
  if torch.cuda.is_available():
235
- # Verify device is accessible
236
  device_name = torch.cuda.get_device_name(0)
237
  print(f" β†’ Verified CUDA device accessible: {device_name}")
238
- # Explicitly set default device to ensure vLLM can detect it
239
  torch.cuda.set_device(0)
240
- print(f" β†’ Set torch.cuda default device to 0")
241
-
 
 
 
 
242
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
243
  if quantization == "awq":
244
  llm_kwargs["quantization"] = "awq"
@@ -265,6 +261,9 @@ def load_vllm_model(model_name: str):
265
  except Exception:
266
  print(f" β†’ FP8 quantization not available, falling back to bf16")
267
 
 
 
 
268
  print(f" β†’ Loading with vLLM (continuous batching, PagedAttention)...")
269
  llm = LLM(**llm_kwargs)
270
  VLLM_MODELS[model_name] = llm
 
14
  # Enable optimizations
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
+ # ZeroGPU often exposes MIG UUIDs; keep them unless the variable is empty
 
 
18
  if torch.cuda.is_available():
 
19
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
20
+ if not cuda_visible:
 
21
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
22
+ cuda_visible = "0"
23
  print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
24
+ print(f"CUDA_VISIBLE_DEVICES: {cuda_visible or os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
25
  else:
26
  print("WARNING: CUDA not available - vLLM will not work")
27
 
 
210
  }
211
 
212
  # Ensure CUDA_VISIBLE_DEVICES is set correctly for vLLM device detection
213
+ # ZeroGPU exposes MIG UUIDs; keep them unless the variable is empty
 
 
214
  cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")
215
+ if not cuda_visible:
 
216
  os.environ["CUDA_VISIBLE_DEVICES"] = "0"
217
+ cuda_visible = "0"
218
+ print(" β†’ CUDA_VISIBLE_DEVICES was empty, set to 0")
 
219
  try:
220
  if hasattr(torch.cuda, '_lazy_init'):
221
  torch.cuda._lazy_init()
222
  except Exception:
223
  pass
224
+ else:
225
+ print(f" β†’ CUDA_VISIBLE_DEVICES retained: {cuda_visible}")
226
+
227
+ # Force torch to see the correct device after ensuring CUDA_VISIBLE_DEVICES
228
  if torch.cuda.is_available():
 
229
  device_name = torch.cuda.get_device_name(0)
230
  print(f" β†’ Verified CUDA device accessible: {device_name}")
 
231
  torch.cuda.set_device(0)
232
+ print(" β†’ Set torch.cuda default device to 0")
233
+
234
+ # Disable Ray executor on ZeroGPU to simplify device handling
235
+ os.environ.setdefault("VLLM_USE_RAY", "0")
236
+ os.environ.setdefault("VLLM_WORKER_USE_RAY", "0")
237
+
238
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
239
  if quantization == "awq":
240
  llm_kwargs["quantization"] = "awq"
 
261
  except Exception:
262
  print(f" β†’ FP8 quantization not available, falling back to bf16")
263
 
264
+ # Explicitly select CUDA device and single-process executor
265
+ llm_kwargs["device"] = "cuda" if torch.cuda.is_available() else "cpu"
266
+
267
  print(f" β†’ Loading with vLLM (continuous batching, PagedAttention)...")
268
  llm = LLM(**llm_kwargs)
269
  VLLM_MODELS[model_name] = llm