tao-shen Claude Opus 4.6 committed on
Commit
d2dcafa
·
1 Parent(s): 5782191

fix: add smart-wait timeout to prevent infinite APP_STARTING polling

Browse files

After 15 polls (~5 min) with no stage change, smart-wait exits and
resumes normal agent turns so they can diagnose why Cain is stuck.
Previously, APP_STARTING could block agents indefinitely (observed
34+ polls / 11+ minutes with no escape).

Counter resets on any stage change.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. scripts/conversation-loop.py +26 -16
scripts/conversation-loop.py CHANGED
@@ -1302,25 +1302,35 @@ if reply:
1302
 
1303
  time.sleep(20)
1304
 
 
 
 
1305
  while True:
1306
  # Smart wait: if Cain is BUILDING/APP_STARTING, skip LLM calls and just poll
1307
  if child_state["stage"] in ("BUILDING", "RESTARTING", "APP_STARTING"):
1308
- print(f"[WAIT] Cain is {child_state['stage']} — polling health instead of LLM call...")
1309
- check_and_clear_cooldown()
1310
- # Quick health check to update stage
1311
- try:
1312
- info = hf_api.space_info(CHILD_SPACE_ID)
1313
- new_stage = info.runtime.stage if info.runtime else "unknown"
1314
- if new_stage != child_state["stage"]:
1315
- print(f"[WAIT] Stage changed: {child_state['stage']} → {new_stage}")
1316
- child_state["stage"] = new_stage
1317
- child_state["alive"] = (new_stage == "RUNNING")
1318
- else:
1319
- print(f"[WAIT] Still {new_stage}... waiting 20s")
1320
- except Exception as e:
1321
- print(f"[WAIT] Health check error: {e}")
1322
- time.sleep(20)
1323
- continue
 
 
 
 
 
 
 
1324
 
1325
  do_turn("Eve", "Adam", EVE_SPACE)
1326
  time.sleep(20) # longer pause — each turn does more work now
 
1302
 
1303
  time.sleep(20)
1304
 
1305
+ smart_wait_count = 0
1306
+ MAX_SMART_WAIT_POLLS = 15 # ~5 min max wait, then let agents diagnose
1307
+
1308
  while True:
1309
  # Smart wait: if Cain is BUILDING/APP_STARTING, skip LLM calls and just poll
1310
  if child_state["stage"] in ("BUILDING", "RESTARTING", "APP_STARTING"):
1311
+ smart_wait_count += 1
1312
+ if smart_wait_count > MAX_SMART_WAIT_POLLS:
1313
+ print(f"[WAIT-TIMEOUT] {smart_wait_count} polls (~{smart_wait_count*20}s) on {child_state['stage']} — resuming agent turns to diagnose")
1314
+ smart_wait_count = 0
1315
+ # Fall through to normal agent turns
1316
+ else:
1317
+ print(f"[WAIT] Cain is {child_state['stage']} — polling health instead of LLM call... ({smart_wait_count}/{MAX_SMART_WAIT_POLLS})")
1318
+ check_and_clear_cooldown()
1319
+ # Quick health check to update stage
1320
+ try:
1321
+ info = hf_api.space_info(CHILD_SPACE_ID)
1322
+ new_stage = info.runtime.stage if info.runtime else "unknown"
1323
+ if new_stage != child_state["stage"]:
1324
+ print(f"[WAIT] Stage changed: {child_state['stage']} → {new_stage}")
1325
+ child_state["stage"] = new_stage
1326
+ child_state["alive"] = (new_stage == "RUNNING")
1327
+ smart_wait_count = 0 # reset on stage change
1328
+ else:
1329
+ print(f"[WAIT] Still {new_stage}... waiting 20s")
1330
+ except Exception as e:
1331
+ print(f"[WAIT] Health check error: {e}")
1332
+ time.sleep(20)
1333
+ continue
1334
 
1335
  do_turn("Eve", "Adam", EVE_SPACE)
1336
  time.sleep(20) # longer pause — each turn does more work now