Spaces:
Sleeping
Sleeping
Commit
·
f886036
1
Parent(s):
8fd14bc
Remove unsupported vLLM device kwarg
Browse files
app.py
CHANGED
|
@@ -261,9 +261,8 @@ def load_vllm_model(model_name: str):
|
|
| 261 |
except Exception:
|
| 262 |
print(f" → FP8 quantization not available, falling back to bf16")
|
| 263 |
|
| 264 |
-
#
|
| 265 |
-
|
| 266 |
-
|
| 267 |
print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
|
| 268 |
llm = LLM(**llm_kwargs)
|
| 269 |
VLLM_MODELS[model_name] = llm
|
|
|
|
| 261 |
except Exception:
|
| 262 |
print(f" → FP8 quantization not available, falling back to bf16")
|
| 263 |
|
| 264 |
+
# vLLM will now detect the CUDA device via torch / environment settings above
|
| 265 |
+
|
|
|
|
| 266 |
print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
|
| 267 |
llm = LLM(**llm_kwargs)
|
| 268 |
VLLM_MODELS[model_name] = llm
|