Claude Code committed on
Commit
543ee28
·
1 Parent(s): 26b6531

god: Fix alive field - don't let /api/state override HF API stage

Browse files

- action_check_health() and init_child_state() were incorrectly setting
alive=True and stage=RUNNING whenever the /api/state endpoint responded
- This masked the RUNTIME_ERROR stage, because the container still responds
on /api/state even when the app has crashed
- The HF API's space_info().runtime.stage is the source of truth for stage/alive
- The health check should only return a status string, not modify child_state

Files changed (1) hide show
  1. scripts/conversation-loop.py +10 -18
scripts/conversation-loop.py CHANGED
@@ -185,16 +185,8 @@ def init_child_state():
185
  info = hf_api.space_info(CHILD_SPACE_ID)
186
  child_state["created"] = True
187
  child_state["stage"] = info.runtime.stage if info.runtime else "unknown"
188
- try:
189
- resp = requests.get(f"{CHILD_SPACE_URL}/api/state", timeout=10)
190
- if resp.ok:
191
- data = resp.json()
192
- child_state["alive"] = True
193
- child_state["state"] = data.get("state", "unknown")
194
- child_state["detail"] = data.get("detail", "")
195
- child_state["stage"] = "RUNNING"
196
- except:
197
- child_state["alive"] = (child_state["stage"] == "RUNNING")
198
  print(f"[init] {CHILD_NAME}: stage={child_state['stage']}, alive={child_state['alive']}")
199
  except:
200
  print(f"[init] {CHILD_NAME} does not exist yet")
@@ -237,25 +229,25 @@ def action_create_child():
237
 
238
 
239
  def action_check_health():
240
- """Check Cain's health with detailed error info."""
241
  if not child_state["created"]:
242
  return f"{CHILD_NAME} not born yet."
 
243
  try:
244
  resp = requests.get(f"{CHILD_SPACE_URL}/api/state", timeout=10)
245
  if resp.ok:
246
  data = resp.json()
247
- child_state["alive"] = True
248
- child_state["state"] = data.get("state", "unknown")
249
- child_state["detail"] = data.get("detail", "")
250
- child_state["stage"] = "RUNNING"
251
- return f"{CHILD_NAME} is ALIVE! State: {child_state['state']}, Detail: {child_state['detail'] or 'healthy'}"
252
  except:
253
  pass
 
254
  try:
255
  info = hf_api.space_info(CHILD_SPACE_ID)
256
  stage = info.runtime.stage if info.runtime else "NO_RUNTIME"
257
- child_state["stage"] = stage
258
- child_state["alive"] = (stage == "RUNNING")
259
  if stage in ("RUNTIME_ERROR", "BUILD_ERROR", "CONFIG_ERROR", "RUNNING"):
260
  error_detail = ""
261
  try:
 
185
  info = hf_api.space_info(CHILD_SPACE_ID)
186
  child_state["created"] = True
187
  child_state["stage"] = info.runtime.stage if info.runtime else "unknown"
188
+ # Use HF API stage as source of truth for alive (stage==RUNNING means healthy)
189
+ child_state["alive"] = (child_state["stage"] == "RUNNING")
 
 
 
 
 
 
 
 
190
  print(f"[init] {CHILD_NAME}: stage={child_state['stage']}, alive={child_state['alive']}")
191
  except:
192
  print(f"[init] {CHILD_NAME} does not exist yet")
 
229
 
230
 
231
  def action_check_health():
232
+ """Check Cain's health with detailed error info. Returns status string, does NOT modify child_state."""
233
  if not child_state["created"]:
234
  return f"{CHILD_NAME} not born yet."
235
+ # Try /api/state endpoint for app-level health (returns app state like "ready", "error")
236
  try:
237
  resp = requests.get(f"{CHILD_SPACE_URL}/api/state", timeout=10)
238
  if resp.ok:
239
  data = resp.json()
240
+ # DO NOT modify child_state here - only main loop should update stage/alive from HF API
241
+ app_state = data.get("state", "unknown")
242
+ app_detail = data.get("detail", "")
243
+ return f"{CHILD_NAME} app endpoint responds. State: {app_state}, Detail: {app_detail or 'healthy'}"
 
244
  except:
245
  pass
246
+ # Fall back to HF API for runtime stage (source of truth for stage/alive)
247
  try:
248
  info = hf_api.space_info(CHILD_SPACE_ID)
249
  stage = info.runtime.stage if info.runtime else "NO_RUNTIME"
250
+ # DO NOT modify child_state here - main loop handles that
 
251
  if stage in ("RUNTIME_ERROR", "BUILD_ERROR", "CONFIG_ERROR", "RUNNING"):
252
  error_detail = ""
253
  try: