msradam committed on
Commit
80deb38
·
verified ·
1 Parent(s): c92bd29

fix(vllm): strip document-role messages before sending to vLLM

Browse files
Files changed (1) hide show
  1. app/llm.py +15 -2
app/llm.py CHANGED
@@ -434,6 +434,19 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
434
  eb = kwargs.setdefault("extra_body", {})
435
  eb["documents"] = docs
436
  eb.setdefault("chat_template_kwargs", {})["documents"] = docs
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  if format == "json":
438
  # OpenAI/vLLM path
439
  kwargs["response_format"] = {"type": "json_object"}
@@ -446,7 +459,7 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
446
  p0 = _sample_gpu_power_w()
447
  t0 = time.monotonic()
448
  if stream:
449
- s = _router.completion(model=alias, messages=messages,
450
  stream=True, **kwargs)
451
 
452
  def _on_stream_done(full_text: str) -> None:
@@ -459,7 +472,7 @@ def chat(model: str, messages: list[dict], options: dict | None = None,
459
  avg_power_w=avg)
460
 
461
  return _stream_to_ollama_shape(s, on_done=_on_stream_done)
462
- resp = _router.completion(model=alias, messages=messages, **kwargs)
463
  duration_s = time.monotonic() - t0
464
  p1 = _sample_gpu_power_w()
465
  avg = _avg_w(p0, p1)
 
434
  eb = kwargs.setdefault("extra_body", {})
435
  eb["documents"] = docs
436
  eb.setdefault("chat_template_kwargs", {})["documents"] = docs
437
+ # vLLM's Granite HF chat template reads documents from
438
+ # chat_template_kwargs (set above). The `role="document <id>"` entries
439
+ # are Ollama-specific; vLLM's Jinja template doesn't recognise
440
+ # non-standard roles and returns 400 immediately, causing LiteLLM to
441
+ # fall back to Ollama (5 s CPU timeout → empty paragraph).
442
+ # Strip them so vLLM gets only user/system/assistant messages.
443
+ # The Ollama fallback receives the same filtered messages, which is
444
+ # acceptable — the Ollama path is a 5 s safety net when vLLM is up.
445
+ effective_messages = (
446
+ [m for m in messages if not m.get("role", "").startswith("document ")]
447
+ if docs and _PRIMARY == "vllm" and _VLLM_BASE
448
+ else messages
449
+ )
450
  if format == "json":
451
  # OpenAI/vLLM path
452
  kwargs["response_format"] = {"type": "json_object"}
 
459
  p0 = _sample_gpu_power_w()
460
  t0 = time.monotonic()
461
  if stream:
462
+ s = _router.completion(model=alias, messages=effective_messages,
463
  stream=True, **kwargs)
464
 
465
  def _on_stream_done(full_text: str) -> None:
 
472
  avg_power_w=avg)
473
 
474
  return _stream_to_ollama_shape(s, on_done=_on_stream_done)
475
+ resp = _router.completion(model=alias, messages=effective_messages, **kwargs)
476
  duration_s = time.monotonic() - t0
477
  p1 = _sample_gpu_power_w()
478
  avg = _avg_w(p0, p1)