Spaces:
Sleeping
Sleeping
Commit
·
2ddfeca
1
Parent(s):
b4fd5e9
Fix vLLM device detection for ZeroGPU
Browse files
- Set CUDA_VISIBLE_DEVICES environment variable for vLLM
- Add CUDA detection logging at module level
- Ensure CUDA is available before attempting vLLM load
- Improve error messages for device detection issues
app.py
CHANGED
|
@@ -14,6 +14,17 @@ from threading import Thread
|
|
| 14 |
# Enable optimizations
|
| 15 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Try to import vLLM (primary inference engine)
|
| 18 |
try:
|
| 19 |
from vllm import LLM, SamplingParams
|
|
@@ -136,9 +147,19 @@ def load_vllm_model(model_name: str):
|
|
| 136 |
print(f"Loading {repo} with vLLM (quantization: {quantization})...")
|
| 137 |
|
| 138 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# vLLM configuration optimized for ZeroGPU H200 slice
|
| 140 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 141 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
|
|
|
| 142 |
llm_kwargs = {
|
| 143 |
"model": repo,
|
| 144 |
"trust_remote_code": True,
|
|
@@ -151,6 +172,10 @@ def load_vllm_model(model_name: str):
|
|
| 151 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 152 |
}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 155 |
if quantization == "awq":
|
| 156 |
llm_kwargs["quantization"] = "awq"
|
|
|
|
# Enable optimizations
torch.backends.cuda.matmul.allow_tf32 = True

# Ensure CUDA is visible to vLLM on ZeroGPU.
# NOTE(review): on ZeroGPU, torch.cuda.is_available() is typically False at
# module import time — the GPU slice is attached only inside @spaces.GPU
# calls. The original code set CUDA_VISIBLE_DEVICES inside the
# is_available() guard, so in exactly the environment this commit targets
# the variable was never set. Set it unconditionally (but never clobber an
# existing value, which ZeroGPU itself may manage) so vLLM's device
# detection works once the GPU appears.
if "CUDA_VISIBLE_DEVICES" not in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    # GPU already attached (non-ZeroGPU hardware, or a warm context).
    print(f"CUDA detected: {torch.cuda.get_device_name(0)}")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
else:
    # Expected at import time on ZeroGPU; the device appears later inside
    # @spaces.GPU-decorated functions.
    print("WARNING: CUDA not available - vLLM will not work")
| 28 |
# Try to import vLLM (primary inference engine)
|
| 29 |
try:
|
| 30 |
from vllm import LLM, SamplingParams
|
|
|
|
| 147 |
print(f"Loading {repo} with vLLM (quantization: {quantization})...")
|
| 148 |
|
| 149 |
try:
|
| 150 |
+
# Detect device explicitly for vLLM
|
| 151 |
+
# vLLM needs explicit device configuration on ZeroGPU
|
| 152 |
+
if not torch.cuda.is_available():
|
| 153 |
+
raise RuntimeError("CUDA not available - vLLM requires GPU. Falling back to Transformers pipeline.")
|
| 154 |
+
|
| 155 |
+
print(f" → CUDA available: {torch.cuda.get_device_name(0)}")
|
| 156 |
+
print(f" → CUDA device count: {torch.cuda.device_count()}")
|
| 157 |
+
print(f" → CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
|
| 158 |
+
|
| 159 |
# vLLM configuration optimized for ZeroGPU H200 slice
|
| 160 |
# vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
|
| 161 |
# Note: HF_TOKEN is passed via environment variable, not as a parameter
|
| 162 |
+
# vLLM auto-detects CUDA from torch.cuda.is_available() and CUDA_VISIBLE_DEVICES
|
| 163 |
llm_kwargs = {
|
| 164 |
"model": repo,
|
| 165 |
"trust_remote_code": True,
|
|
|
|
| 172 |
"enable_prefix_caching": True, # Cache prompts for faster TTFT
|
| 173 |
}
|
| 174 |
|
| 175 |
+
# Ensure CUDA_VISIBLE_DEVICES is set for vLLM device detection
|
| 176 |
+
if "CUDA_VISIBLE_DEVICES" not in os.environ:
|
| 177 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
| 178 |
+
|
| 179 |
# Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
|
| 180 |
if quantization == "awq":
|
| 181 |
llm_kwargs["quantization"] = "awq"
|