Spaces:
Paused
Paused
Peter Larnholt
committed on
Commit
·
3356350
1
Parent(s):
4142581
Disable guided decoding to resolve chat completion errors
Browse files
The vLLM server was returning 500 errors on chat requests due to
guided decoding (outlines) import issues. Since basic chat doesn't
require structured generation, disabled guided decoding entirely
and removed airportsdata dependency.
- app.py +1 -0
- requirements.txt +0 -3
app.py
CHANGED
|
@@ -27,6 +27,7 @@ VLLM_ARGS = [
|
|
| 27 |
"--gpu-memory-utilization", "0.90",
|
| 28 |
"--trust-remote-code",
|
| 29 |
"--disable-log-requests", # reduce log noise
|
|
|
|
| 30 |
]
|
| 31 |
if "AWQ" in MODEL_ID.upper():
|
| 32 |
VLLM_ARGS += ["--quantization", "awq_marlin"] # faster AWQ kernel if available
|
|
|
|
| 27 |
"--gpu-memory-utilization", "0.90",
|
| 28 |
"--trust-remote-code",
|
| 29 |
"--disable-log-requests", # reduce log noise
|
| 30 |
+
"--disable-guided-decoding", # skip guided decoding (outlines) to avoid import issues
|
| 31 |
]
|
| 32 |
if "AWQ" in MODEL_ID.upper():
|
| 33 |
VLLM_ARGS += ["--quantization", "awq_marlin"] # faster AWQ kernel if available
|
requirements.txt
CHANGED
|
@@ -9,6 +9,3 @@ vllm==0.6.3.post1
|
|
| 9 |
torch==2.4.0
|
| 10 |
transformers>=4.44
|
| 11 |
accelerate>=0.30
|
| 12 |
-
|
| 13 |
-
# Required for vLLM guided decoding (even if not actively used)
|
| 14 |
-
airportsdata>=20240400
|
|
|
|
| 9 |
torch==2.4.0
|
| 10 |
transformers>=4.44
|
| 11 |
accelerate>=0.30
|
|
|
|
|
|
|
|
|