fix(warmup): fire vLLM warmup before planner so RunPod loads during planner+stones
Browse files
- app/fsm.py +0 -18
app/fsm.py
CHANGED
|
@@ -1550,24 +1550,6 @@ def iter_steps(query: str):
|
|
| 1550 |
app = build_app(query, step_queue=q)
|
| 1551 |
final_state_holder: dict[str, Any] = {}
|
| 1552 |
|
| 1553 |
-
# Fire a tiny warmup request to the LLM backend immediately so that a
|
| 1554 |
-
# cold RunPod pod starts loading the model while the ~35s specialist
|
| 1555 |
-
# pipeline runs. By the time step_reconcile starts, the GPU should be
|
| 1556 |
-
# warm and the first token arrives quickly.
|
| 1557 |
-
def _warmup_llm():
|
| 1558 |
-
try:
|
| 1559 |
-
from app import llm as _llm
|
| 1560 |
-
_llm.chat(
|
| 1561 |
-
model="granite-8b",
|
| 1562 |
-
messages=[{"role": "user", "content": "hi"}],
|
| 1563 |
-
options={"num_predict": 1, "temperature": 0},
|
| 1564 |
-
stream=False,
|
| 1565 |
-
)
|
| 1566 |
-
except Exception:
|
| 1567 |
-
pass # warmup failure is silent — reconciler will still try
|
| 1568 |
-
|
| 1569 |
-
_threading.Thread(target=_warmup_llm, daemon=True, name="riprap-warmup").start()
|
| 1570 |
-
|
| 1571 |
# Threadlocals are per-thread; the request thread (single_address.run
|
| 1572 |
# / neighborhood.run) sets the strict-mode flag, planner specialist
|
| 1573 |
# set, and token / Mellea-attempt callbacks, but Burr's app.iterate
|
|
|
|
| 1550 |
app = build_app(query, step_queue=q)
|
| 1551 |
final_state_holder: dict[str, Any] = {}
|
| 1552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1553 |
# Threadlocals are per-thread; the request thread (single_address.run
|
| 1554 |
# / neighborhood.run) sets the strict-mode flag, planner specialist
|
| 1555 |
# set, and token / Mellea-attempt callbacks, but Burr's app.iterate
|