msradam committed on
Commit
ad05fd2
·
verified ·
1 Parent(s): 602bc83

fix(warmup): fire vLLM warmup before planner so RunPod loads during planner+stones

Browse files
Files changed (1) hide show
  1. app/fsm.py +0 -18
app/fsm.py CHANGED
@@ -1550,24 +1550,6 @@ def iter_steps(query: str):
1550
  app = build_app(query, step_queue=q)
1551
  final_state_holder: dict[str, Any] = {}
1552
 
1553
- # Fire a tiny warmup request to the LLM backend immediately so that a
1554
- # cold RunPod pod starts loading the model while the ~35s specialist
1555
- # pipeline runs. By the time step_reconcile starts, the GPU should be
1556
- # warm and the first token arrives quickly.
1557
- def _warmup_llm():
1558
- try:
1559
- from app import llm as _llm
1560
- _llm.chat(
1561
- model="granite-8b",
1562
- messages=[{"role": "user", "content": "hi"}],
1563
- options={"num_predict": 1, "temperature": 0},
1564
- stream=False,
1565
- )
1566
- except Exception:
1567
- pass # warmup failure is silent — reconciler will still try
1568
-
1569
- _threading.Thread(target=_warmup_llm, daemon=True, name="riprap-warmup").start()
1570
-
1571
  # Threadlocals are per-thread; the request thread (single_address.run
1572
  # / neighborhood.run) sets the strict-mode flag, planner specialist
1573
  # set, and token / Mellea-attempt callbacks, but Burr's app.iterate
 
1550
  app = build_app(query, step_queue=q)
1551
  final_state_holder: dict[str, Any] = {}
1552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1553
  # Threadlocals are per-thread; the request thread (single_address.run
1554
  # / neighborhood.run) sets the strict-mode flag, planner specialist
1555
  # set, and token / Mellea-attempt callbacks, but Burr's app.iterate