Alikestocode committed on
Commit
f886036
·
1 Parent(s): 8fd14bc

Remove unsupported vLLM device kwarg

Browse files
Files changed (1) hide show
  1. app.py +2 -3
app.py CHANGED
@@ -261,9 +261,8 @@ def load_vllm_model(model_name: str):
261
  except Exception:
262
  print(f" → FP8 quantization not available, falling back to bf16")
263
 
264
- # Explicitly select CUDA device and single-process executor
265
- llm_kwargs["device"] = "cuda" if torch.cuda.is_available() else "cpu"
266
-
267
  print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
268
  llm = LLM(**llm_kwargs)
269
  VLLM_MODELS[model_name] = llm
 
261
  except Exception:
262
  print(f" → FP8 quantization not available, falling back to bf16")
263
 
264
+ # vLLM will now detect the CUDA device via torch / environment settings above
265
+
 
266
  print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
267
  llm = LLM(**llm_kwargs)
268
  VLLM_MODELS[model_name] = llm