msradam committed on
Commit
3034360
·
verified ·
1 Parent(s): 51e6b76

fix(vllm): only pass documents via chat_template_kwargs, not top-level

Browse files
Files changed (1) hide show
  1. app/llm.py +18 -13
app/llm.py CHANGED
@@ -428,20 +428,25 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
428
  kwargs = _opts_to_kwargs(options)
429
  docs = _extract_documents(messages)
430
  if docs:
431
- # Merge into extra_body so Granite's HF chat template (vLLM)
432
- # picks them up. Ollama backend ignores extra_body and keeps
433
- # using the role="document <id>" messages already in `messages`.
434
  eb = kwargs.setdefault("extra_body", {})
435
- eb["documents"] = docs
436
- eb.setdefault("chat_template_kwargs", {})["documents"] = docs
437
- # vLLM's Granite HF chat template reads documents from
438
- # chat_template_kwargs (set above). The `role="document <id>"` entries
439
- # are Ollama-specific; vLLM's Jinja template doesn't recognise
440
- # non-standard roles and returns 400 immediately, causing LiteLLM to
441
- # fall back to Ollama (5 s CPU timeout → empty paragraph).
442
- # Strip them so vLLM gets only user/system/assistant messages.
443
- # The Ollama fallback receives the same filtered messages, which is
444
- # acceptable — the Ollama path is a 5 s safety net when vLLM is up.
 
 
 
 
 
 
 
 
445
  effective_messages = (
446
  [m for m in messages if not m.get("role", "").startswith("document ")]
447
  if docs and _PRIMARY == "vllm" and _VLLM_BASE
 
428
  kwargs = _opts_to_kwargs(options)
429
  docs = _extract_documents(messages)
430
  if docs:
 
 
 
431
  eb = kwargs.setdefault("extra_body", {})
432
+ if _PRIMARY == "vllm" and _VLLM_BASE:
433
+ # vLLM's Granite HF chat template reads documents from
434
+ # chat_template_kwargs.documents only. Sending them ALSO as a
435
+ # top-level "documents" key causes vLLM to inject them twice
436
+ # (once via the template kwarg path, once via vLLM's own
437
+ # grounding handler), doubling the token count and blowing
438
+ # past max_model_len=2352 → 400 → LiteLLM Ollama fallback →
439
+ # 5 s CPU timeout → empty paragraph.
440
+ eb.setdefault("chat_template_kwargs", {})["documents"] = docs
441
+ else:
442
+ # Ollama backend: role="document <id>" messages in `messages`
443
+ # are the primary document path. The extra_body fields are
444
+ # forwarded as-is for Ollama's own handling.
445
+ eb["documents"] = docs
446
+ eb.setdefault("chat_template_kwargs", {})["documents"] = docs
447
+ # For vLLM, strip the role="document <id>" messages from the messages
448
+ # array — vLLM's Jinja template doesn't recognise non-standard roles
449
+ # (raises 400). Documents are already in chat_template_kwargs above.
450
  effective_messages = (
451
  [m for m in messages if not m.get("role", "").startswith("document ")]
452
  if docs and _PRIMARY == "vllm" and _VLLM_BASE