Alikestocode committed on
Commit
2ddfeca
·
1 Parent(s): b4fd5e9

Fix vLLM device detection for ZeroGPU

Browse files

- Set CUDA_VISIBLE_DEVICES environment variable for vLLM
- Add CUDA detection logging at module level
- Ensure CUDA is available before attempting vLLM load
- Improve error messages for device detection issues

Files changed (1) hide show
  1. app.py +25 -0
app.py CHANGED
@@ -14,6 +14,17 @@ from threading import Thread
14
  # Enable optimizations
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
 
 
 
 
 
 
 
 
 
 
 
17
  # Try to import vLLM (primary inference engine)
18
  try:
19
  from vllm import LLM, SamplingParams
@@ -136,9 +147,19 @@ def load_vllm_model(model_name: str):
136
  print(f"Loading {repo} with vLLM (quantization: {quantization})...")
137
 
138
  try:
 
 
 
 
 
 
 
 
 
139
  # vLLM configuration optimized for ZeroGPU H200 slice
140
  # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
141
  # Note: HF_TOKEN is passed via environment variable, not as a parameter
 
142
  llm_kwargs = {
143
  "model": repo,
144
  "trust_remote_code": True,
@@ -151,6 +172,10 @@ def load_vllm_model(model_name: str):
151
  "enable_prefix_caching": True, # Cache prompts for faster TTFT
152
  }
153
 
 
 
 
 
154
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
155
  if quantization == "awq":
156
  llm_kwargs["quantization"] = "awq"
 
14
  # Enable optimizations
15
  torch.backends.cuda.matmul.allow_tf32 = True
16
 
17
+ # Ensure CUDA is visible to vLLM on ZeroGPU
18
+ # vLLM needs explicit CUDA device configuration
19
+ if torch.cuda.is_available():
20
+ # Set CUDA_VISIBLE_DEVICES if not already set (helps vLLM detect GPU)
21
+ if "CUDA_VISIBLE_DEVICES" not in os.environ:
22
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
23
+ print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
24
+ print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
25
+ else:
26
+ print("WARNING: CUDA not available - vLLM will not work")
27
+
28
  # Try to import vLLM (primary inference engine)
29
  try:
30
  from vllm import LLM, SamplingParams
 
147
  print(f"Loading {repo} with vLLM (quantization: {quantization})...")
148
 
149
  try:
150
+ # Detect device explicitly for vLLM
151
+ # vLLM needs explicit device configuration on ZeroGPU
152
+ if not torch.cuda.is_available():
153
+ raise RuntimeError("CUDA not available - vLLM requires GPU. Falling back to Transformers pipeline.")
154
+
155
+ print(f" → CUDA available: {torch.cuda.get_device_name(0)}")
156
+ print(f" → CUDA device count: {torch.cuda.device_count()}")
157
+ print(f" → CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
158
+
159
  # vLLM configuration optimized for ZeroGPU H200 slice
160
  # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
161
  # Note: HF_TOKEN is passed via environment variable, not as a parameter
162
+ # vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
163
  llm_kwargs = {
164
  "model": repo,
165
  "trust_remote_code": True,
 
172
  "enable_prefix_caching": True, # Cache prompts for faster TTFT
173
  }
174
 
175
+ # Ensure CUDA_VISIBLE_DEVICES is set for vLLM device detection
176
+ if "CUDA_VISIBLE_DEVICES" not in os.environ:
177
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
178
+
179
  # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
180
  if quantization == "awq":
181
  llm_kwargs["quantization"] = "awq"