msradam committed on
Commit
51e6b76
·
verified ·
1 Parent(s): daf3545

fix(vllm): reduce num_predict 512→350 to stay under max_model_len=2352

Browse files
Files changed (1) hide show
  1. app/mellea_validator.py +5 -4
app/mellea_validator.py CHANGED
@@ -357,14 +357,15 @@ def reconcile_strict_streaming(
357
  {"role": "system", "content": system_prompt},
358
  {"role": "user", "content": user_prompt},
359
  ]
360
- # num_predict 512 lets the 4-section briefing complete in one pass.
361
- # Reconciler prompts run ~1200 tokens (after trim_docs_to_plan),
362
- # so 1200+512=1712 comfortably under the vLLM max_model_len=2352.
 
363
  # Override with RIPRAP_MELLEA_NUM_PREDICT if needed.
364
  # num_ctx (Ollama only) is forwarded via extra_body; vLLM ignores it.
365
  base_opts = {"temperature": 0,
366
  "num_ctx": int(os.environ.get("RIPRAP_MELLEA_NUM_CTX", "4096")),
367
- "num_predict": int(os.environ.get("RIPRAP_MELLEA_NUM_PREDICT", "512")),
368
  **(ollama_options or {})}
369
 
370
  paragraph = ""
 
357
  {"role": "system", "content": system_prompt},
358
  {"role": "user", "content": user_prompt},
359
  ]
360
+ # num_predict 350 for the 4-section briefing (typically 250-350 tokens).
361
+ # Lower ceiling (was 512) frees ~160 tokens of input budget, keeping the
362
+ # full prompt (documents + system prompt + 350 output) under
363
+ # max_model_len=2352 for the RunPod vLLM deployment.
364
  # Override with RIPRAP_MELLEA_NUM_PREDICT if needed.
365
  # num_ctx (Ollama only) is forwarded via extra_body; vLLM ignores it.
366
  base_opts = {"temperature": 0,
367
  "num_ctx": int(os.environ.get("RIPRAP_MELLEA_NUM_CTX", "4096")),
368
+ "num_predict": int(os.environ.get("RIPRAP_MELLEA_NUM_PREDICT", "350")),
369
  **(ollama_options or {})}
370
 
371
  paragraph = ""