fix(vllm): strip document-role messages before sending to vLLM
Browse files- app/llm.py +15 -2
app/llm.py
CHANGED
|
@@ -434,6 +434,19 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
|
|
| 434 |
eb = kwargs.setdefault("extra_body", {})
|
| 435 |
eb["documents"] = docs
|
| 436 |
eb.setdefault("chat_template_kwargs", {})["documents"] = docs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
if format == "json":
|
| 438 |
# OpenAI/vLLM path
|
| 439 |
kwargs["response_format"] = {"type": "json_object"}
|
|
@@ -446,7 +459,7 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
|
|
| 446 |
p0 = _sample_gpu_power_w()
|
| 447 |
t0 = time.monotonic()
|
| 448 |
if stream:
|
| 449 |
-
s = _router.completion(model=alias, messages=messages,
|
| 450 |
stream=True, **kwargs)
|
| 451 |
|
| 452 |
def _on_stream_done(full_text: str) -> None:
|
|
@@ -459,7 +472,7 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
|
|
| 459 |
avg_power_w=avg)
|
| 460 |
|
| 461 |
return _stream_to_ollama_shape(s, on_done=_on_stream_done)
|
| 462 |
-
resp = _router.completion(model=alias, messages=messages, **kwargs)
|
| 463 |
duration_s = time.monotonic() - t0
|
| 464 |
p1 = _sample_gpu_power_w()
|
| 465 |
avg = _avg_w(p0, p1)
|
|
|
|
| 434 |
eb = kwargs.setdefault("extra_body", {})
|
| 435 |
eb["documents"] = docs
|
| 436 |
eb.setdefault("chat_template_kwargs", {})["documents"] = docs
|
| 437 |
+
# vLLM's Granite HF chat template reads documents from
|
| 438 |
+
# chat_template_kwargs (set above). The `role="document <id>"` entries
|
| 439 |
+
# are Ollama-specific; vLLM's Jinja template doesn't recognise
|
| 440 |
+
# non-standard roles and returns 400 immediately, causing LiteLLM to
|
| 441 |
+
# fall back to Ollama (5 s CPU timeout → empty paragraph).
|
| 442 |
+
# Strip them so vLLM gets only user/system/assistant messages.
|
| 443 |
+
# The Ollama fallback receives the same filtered messages, which is
|
| 444 |
+
# acceptable — the Ollama path is a 5 s safety net when vLLM is up.
|
| 445 |
+
effective_messages = (
|
| 446 |
+
[m for m in messages if not m.get("role", "").startswith("document ")]
|
| 447 |
+
if docs and _PRIMARY == "vllm" and _VLLM_BASE
|
| 448 |
+
else messages
|
| 449 |
+
)
|
| 450 |
if format == "json":
|
| 451 |
# OpenAI/vLLM path
|
| 452 |
kwargs["response_format"] = {"type": "json_object"}
|
|
|
|
| 459 |
p0 = _sample_gpu_power_w()
|
| 460 |
t0 = time.monotonic()
|
| 461 |
if stream:
|
| 462 |
+
s = _router.completion(model=alias, messages=effective_messages,
|
| 463 |
stream=True, **kwargs)
|
| 464 |
|
| 465 |
def _on_stream_done(full_text: str) -> None:
|
|
|
|
| 472 |
avg_power_w=avg)
|
| 473 |
|
| 474 |
return _stream_to_ollama_shape(s, on_done=_on_stream_done)
|
| 475 |
+
resp = _router.completion(model=alias, messages=effective_messages, **kwargs)
|
| 476 |
duration_s = time.monotonic() - t0
|
| 477 |
p1 = _sample_gpu_power_w()
|
| 478 |
avg = _avg_w(p0, p1)
|