Spaces:
Running
Running
fix: add smart-wait timeout to prevent infinite APP_STARTING polling
Browse files
After 15 polls (~5 min) with no stage change, smart-wait exits and
resumes normal agent turns so they can diagnose why Cain is stuck.
Previously, APP_STARTING could block agents indefinitely (observed
34+ polls / 11+ minutes with no escape).
Counter resets on any stage change.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- scripts/conversation-loop.py +26 -16
scripts/conversation-loop.py
CHANGED
|
@@ -1302,25 +1302,35 @@ if reply:
|
|
| 1302 |
|
| 1303 |
time.sleep(20)
|
| 1304 |
|
|
|
|
|
|
|
|
|
|
| 1305 |
while True:
|
| 1306 |
# Smart wait: if Cain is BUILDING/APP_STARTING, skip LLM calls and just poll
|
| 1307 |
if child_state["stage"] in ("BUILDING", "RESTARTING", "APP_STARTING"):
|
| 1308 |
-
|
| 1309 |
-
|
| 1310 |
-
|
| 1311 |
-
|
| 1312 |
-
|
| 1313 |
-
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
|
| 1323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1324 |
|
| 1325 |
do_turn("Eve", "Adam", EVE_SPACE)
|
| 1326 |
time.sleep(20) # longer pause — each turn does more work now
|
|
|
|
| 1302 |
|
| 1303 |
time.sleep(20)
|
| 1304 |
|
| 1305 |
+
smart_wait_count = 0
|
| 1306 |
+
MAX_SMART_WAIT_POLLS = 15 # ~5 min max wait, then let agents diagnose
|
| 1307 |
+
|
| 1308 |
while True:
|
| 1309 |
# Smart wait: if Cain is BUILDING/APP_STARTING, skip LLM calls and just poll
|
| 1310 |
if child_state["stage"] in ("BUILDING", "RESTARTING", "APP_STARTING"):
|
| 1311 |
+
smart_wait_count += 1
|
| 1312 |
+
if smart_wait_count > MAX_SMART_WAIT_POLLS:
|
| 1313 |
+
print(f"[WAIT-TIMEOUT] {smart_wait_count} polls (~{smart_wait_count*20}s) on {child_state['stage']} — resuming agent turns to diagnose")
|
| 1314 |
+
smart_wait_count = 0
|
| 1315 |
+
# Fall through to normal agent turns
|
| 1316 |
+
else:
|
| 1317 |
+
print(f"[WAIT] Cain is {child_state['stage']} — polling health instead of LLM call... ({smart_wait_count}/{MAX_SMART_WAIT_POLLS})")
|
| 1318 |
+
check_and_clear_cooldown()
|
| 1319 |
+
# Quick health check to update stage
|
| 1320 |
+
try:
|
| 1321 |
+
info = hf_api.space_info(CHILD_SPACE_ID)
|
| 1322 |
+
new_stage = info.runtime.stage if info.runtime else "unknown"
|
| 1323 |
+
if new_stage != child_state["stage"]:
|
| 1324 |
+
print(f"[WAIT] Stage changed: {child_state['stage']} → {new_stage}")
|
| 1325 |
+
child_state["stage"] = new_stage
|
| 1326 |
+
child_state["alive"] = (new_stage == "RUNNING")
|
| 1327 |
+
smart_wait_count = 0 # reset on stage change
|
| 1328 |
+
else:
|
| 1329 |
+
print(f"[WAIT] Still {new_stage}... waiting 20s")
|
| 1330 |
+
except Exception as e:
|
| 1331 |
+
print(f"[WAIT] Health check error: {e}")
|
| 1332 |
+
time.sleep(20)
|
| 1333 |
+
continue
|
| 1334 |
|
| 1335 |
do_turn("Eve", "Adam", EVE_SPACE)
|
| 1336 |
time.sleep(20) # longer pause — each turn does more work now
|