fix(vllm): reduce num_predict 512→350 to stay under max_model_len=2352
Browse files- app/mellea_validator.py +5 -4
app/mellea_validator.py
CHANGED
|
@@ -357,14 +357,15 @@ def reconcile_strict_streaming(
|
|
| 357 |
{"role": "system", "content": system_prompt},
|
| 358 |
{"role": "user", "content": user_prompt},
|
| 359 |
]
|
| 360 |
-
# num_predict
|
| 361 |
-
#
|
| 362 |
-
#
|
|
|
|
| 363 |
# Override with RIPRAP_MELLEA_NUM_PREDICT if needed.
|
| 364 |
# num_ctx (Ollama only) is forwarded via extra_body; vLLM ignores it.
|
| 365 |
base_opts = {"temperature": 0,
|
| 366 |
"num_ctx": int(os.environ.get("RIPRAP_MELLEA_NUM_CTX", "4096")),
|
| 367 |
-
"num_predict": int(os.environ.get("RIPRAP_MELLEA_NUM_PREDICT", "512")),
|
| 368 |
**(ollama_options or {})}
|
| 369 |
|
| 370 |
paragraph = ""
|
|
|
|
| 357 |
{"role": "system", "content": system_prompt},
|
| 358 |
{"role": "user", "content": user_prompt},
|
| 359 |
]
|
| 360 |
+
# num_predict 350 for the 4-section briefing (typically 250-350 tokens).
|
| 361 |
+
# Lower ceiling (was 512) frees ~160 tokens of input budget, keeping the
|
| 362 |
+
# full prompt (documents + system prompt + 350 output) under
|
| 363 |
+
# max_model_len=2352 for the RunPod vLLM deployment.
|
| 364 |
# Override with RIPRAP_MELLEA_NUM_PREDICT if needed.
|
| 365 |
# num_ctx (Ollama only) is forwarded via extra_body; vLLM ignores it.
|
| 366 |
base_opts = {"temperature": 0,
|
| 367 |
"num_ctx": int(os.environ.get("RIPRAP_MELLEA_NUM_CTX", "4096")),
|
| 368 |
+
"num_predict": int(os.environ.get("RIPRAP_MELLEA_NUM_PREDICT", "350")),
|
| 369 |
**(ollama_options or {})}
|
| 370 |
|
| 371 |
paragraph = ""
|