Spaces:
Paused
Paused
Claude Code committed on
Commit ·
543ee28
1
Parent(s): 26b6531
god: Fix alive field - don't let /api/state override HF API stage
Browse files
- action_check_health() and init_child_state() were incorrectly setting
alive=True and stage=RUNNING when the /api/state endpoint responded
- This masked the RUNTIME_ERROR state because the container still responds
on /api/state even when the app has crashed
- HF API space_info().runtime.stage is the source of truth for stage/alive
- Health check should only return status string, not modify state
- scripts/conversation-loop.py +10 -18
scripts/conversation-loop.py
CHANGED
|
@@ -185,16 +185,8 @@ def init_child_state():
|
|
| 185 |
info = hf_api.space_info(CHILD_SPACE_ID)
|
| 186 |
child_state["created"] = True
|
| 187 |
child_state["stage"] = info.runtime.stage if info.runtime else "unknown"
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
if resp.ok:
|
| 191 |
-
data = resp.json()
|
| 192 |
-
child_state["alive"] = True
|
| 193 |
-
child_state["state"] = data.get("state", "unknown")
|
| 194 |
-
child_state["detail"] = data.get("detail", "")
|
| 195 |
-
child_state["stage"] = "RUNNING"
|
| 196 |
-
except:
|
| 197 |
-
child_state["alive"] = (child_state["stage"] == "RUNNING")
|
| 198 |
print(f"[init] {CHILD_NAME}: stage={child_state['stage']}, alive={child_state['alive']}")
|
| 199 |
except:
|
| 200 |
print(f"[init] {CHILD_NAME} does not exist yet")
|
|
@@ -237,25 +229,25 @@ def action_create_child():
|
|
| 237 |
|
| 238 |
|
| 239 |
def action_check_health():
|
| 240 |
-
"""Check Cain's health with detailed error info."""
|
| 241 |
if not child_state["created"]:
|
| 242 |
return f"{CHILD_NAME} not born yet."
|
|
|
|
| 243 |
try:
|
| 244 |
resp = requests.get(f"{CHILD_SPACE_URL}/api/state", timeout=10)
|
| 245 |
if resp.ok:
|
| 246 |
data = resp.json()
|
| 247 |
-
child_state
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
return f"{CHILD_NAME} is ALIVE! State: {child_state['state']}, Detail: {child_state['detail'] or 'healthy'}"
|
| 252 |
except:
|
| 253 |
pass
|
|
|
|
| 254 |
try:
|
| 255 |
info = hf_api.space_info(CHILD_SPACE_ID)
|
| 256 |
stage = info.runtime.stage if info.runtime else "NO_RUNTIME"
|
| 257 |
-
child_state
|
| 258 |
-
child_state["alive"] = (stage == "RUNNING")
|
| 259 |
if stage in ("RUNTIME_ERROR", "BUILD_ERROR", "CONFIG_ERROR", "RUNNING"):
|
| 260 |
error_detail = ""
|
| 261 |
try:
|
|
|
|
| 185 |
info = hf_api.space_info(CHILD_SPACE_ID)
|
| 186 |
child_state["created"] = True
|
| 187 |
child_state["stage"] = info.runtime.stage if info.runtime else "unknown"
|
| 188 |
+
# Use HF API stage as source of truth for alive (stage==RUNNING means healthy)
|
| 189 |
+
child_state["alive"] = (child_state["stage"] == "RUNNING")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
print(f"[init] {CHILD_NAME}: stage={child_state['stage']}, alive={child_state['alive']}")
|
| 191 |
except:
|
| 192 |
print(f"[init] {CHILD_NAME} does not exist yet")
|
|
|
|
| 229 |
|
| 230 |
|
| 231 |
def action_check_health():
|
| 232 |
+
"""Check Cain's health with detailed error info. Returns status string, does NOT modify child_state."""
|
| 233 |
if not child_state["created"]:
|
| 234 |
return f"{CHILD_NAME} not born yet."
|
| 235 |
+
# Try /api/state endpoint for app-level health (returns app state like "ready", "error")
|
| 236 |
try:
|
| 237 |
resp = requests.get(f"{CHILD_SPACE_URL}/api/state", timeout=10)
|
| 238 |
if resp.ok:
|
| 239 |
data = resp.json()
|
| 240 |
+
# DO NOT modify child_state here - only main loop should update stage/alive from HF API
|
| 241 |
+
app_state = data.get("state", "unknown")
|
| 242 |
+
app_detail = data.get("detail", "")
|
| 243 |
+
return f"{CHILD_NAME} app endpoint responds. State: {app_state}, Detail: {app_detail or 'healthy'}"
|
|
|
|
| 244 |
except:
|
| 245 |
pass
|
| 246 |
+
# Fall back to HF API for runtime stage (source of truth for stage/alive)
|
| 247 |
try:
|
| 248 |
info = hf_api.space_info(CHILD_SPACE_ID)
|
| 249 |
stage = info.runtime.stage if info.runtime else "NO_RUNTIME"
|
| 250 |
+
# DO NOT modify child_state here - main loop handles that
|
|
|
|
| 251 |
if stage in ("RUNTIME_ERROR", "BUILD_ERROR", "CONFIG_ERROR", "RUNNING"):
|
| 252 |
error_detail = ""
|
| 253 |
try:
|