fix(vllm): only pass documents via chat_template_kwargs, not top-level
Browse files- app/llm.py +18 -13
app/llm.py
CHANGED
|
@@ -428,20 +428,25 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
|
|
| 428 |
kwargs = _opts_to_kwargs(options)
|
| 429 |
docs = _extract_documents(messages)
|
| 430 |
if docs:
|
| 431 |
-
# Merge into extra_body so Granite's HF chat template (vLLM)
|
| 432 |
-
# picks them up. Ollama backend ignores extra_body and keeps
|
| 433 |
-
# using the role="document <id>" messages already in `messages`.
|
| 434 |
eb = kwargs.setdefault("extra_body", {})
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
effective_messages = (
|
| 446 |
[m for m in messages if not m.get("role", "").startswith("document ")]
|
| 447 |
if docs and _PRIMARY == "vllm" and _VLLM_BASE
|
|
|
|
| 428 |
kwargs = _opts_to_kwargs(options)
|
| 429 |
docs = _extract_documents(messages)
|
| 430 |
if docs:
|
|
|
|
|
|
|
|
|
|
| 431 |
eb = kwargs.setdefault("extra_body", {})
|
| 432 |
+
if _PRIMARY == "vllm" and _VLLM_BASE:
|
| 433 |
+
# vLLM's Granite HF chat template reads documents from
|
| 434 |
+
# chat_template_kwargs.documents only. Sending them ALSO as a
|
| 435 |
+
# top-level "documents" key causes vLLM to inject them twice
|
| 436 |
+
# (once via the template kwarg path, once via vLLM's own
|
| 437 |
+
# grounding handler), doubling the token count and blowing
|
| 438 |
+
# past max_model_len=2352 → 400 → LiteLLM Ollama fallback →
|
| 439 |
+
# 5 s CPU timeout → empty paragraph.
|
| 440 |
+
eb.setdefault("chat_template_kwargs", {})["documents"] = docs
|
| 441 |
+
else:
|
| 442 |
+
# Ollama backend: role="document <id>" messages in `messages`
|
| 443 |
+
# are the primary document path. The extra_body fields are
|
| 444 |
+
# forwarded as-is for Ollama's own handling.
|
| 445 |
+
eb["documents"] = docs
|
| 446 |
+
eb.setdefault("chat_template_kwargs", {})["documents"] = docs
|
| 447 |
+
# For vLLM, strip the role="document <id>" messages from the messages
|
| 448 |
+
# array — vLLM's Jinja template doesn't recognise non-standard roles
|
| 449 |
+
# (raises 400). Documents are already in chat_template_kwargs above.
|
| 450 |
effective_messages = (
|
| 451 |
[m for m in messages if not m.get("role", "").startswith("document ")]
|
| 452 |
if docs and _PRIMARY == "vllm" and _VLLM_BASE
|