garvitsachdeva Claude Sonnet 4.6 committed on
Commit
52e0424
·
1 Parent(s): 0c2a6e7

Fix logs not showing: use file-based state instead of in-memory globals

Browse files

Gradio 5 SSR mode runs event handlers in separate worker processes.
Those workers have their own empty copies of _logs/_status and never
see updates from the training thread in the main process.

Fix: _log() already writes to a file. Now _write_status() persists
phase/done/error to training_status.json. _get_state() reads both
files so it works regardless of which process handles the request.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +38 -21
app.py CHANGED
@@ -24,18 +24,26 @@ import threading
24
  import json, time
25
  import numpy as np
26
 
27
- # ── Shared state ─────────────────────────────────────────────
28
- _logs = []
29
- _status = {"phase": "starting", "done": False, "error": None}
30
- _LOG_FILE = "/home/user/app/assets/training_log.txt"
 
 
 
 
 
 
 
 
31
 
32
 
33
  def _log(msg: str):
34
  ts = time.strftime("%H:%M:%S")
35
  line = f"[{ts}] {msg}"
36
- _logs.append(line)
37
  print(line, flush=True)
38
  try:
 
39
  with open(_LOG_FILE, "a", encoding="utf-8") as f:
40
  f.write(line + "\n")
41
  except Exception:
@@ -253,7 +261,7 @@ def _training_thread():
253
  total_steps = int(cfg.get("training", {}).get("total_timesteps", 500_000))
254
  _log(f"Total steps : {total_steps:,}")
255
  _log("Training started...\n")
256
- _status["phase"] = "training"
257
 
258
  reward_logger = RewardLogger(curriculum=curriculum)
259
  checkpoint_cb = CheckpointCallback(
@@ -283,7 +291,7 @@ def _training_thread():
283
  _log(f"Final curriculum: {curriculum.progress_str()}")
284
 
285
  # ── Reward curve ────────────────────────────────────
286
- _status["phase"] = "saving"
287
  ep_rewards = reward_logger.episode_rewards or [0.0]
288
  episodes = list(range(len(ep_rewards)))
289
  window = max(50, len(ep_rewards) // 20)
@@ -320,7 +328,7 @@ def _training_thread():
320
  _log("Reward curve saved.")
321
 
322
  # ── Push everything to HF Hub ────────────────────────
323
- _status["phase"] = "uploading"
324
  _log(f"Pushing to https://huggingface.co/{HF_REPO} ...")
325
 
326
  ep = reward_logger.episode_rewards
@@ -390,15 +398,13 @@ model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
390
 
391
  _log(f"Uploaded {len(ops)} files.")
392
  _log(f"Model live at: https://huggingface.co/{HF_REPO}")
393
- _status["done"] = True
394
- _status["phase"] = "complete"
395
 
396
  except Exception as exc:
397
  import traceback
398
  _log(f"ERROR: {exc}")
399
  _log(traceback.format_exc())
400
- _status["error"] = str(exc)
401
- _status["phase"] = "error"
402
 
403
 
404
  # ── Start training immediately on Space boot ──────────────────
@@ -408,18 +414,29 @@ _thread.start()
408
 
409
  # ── Gradio UI ─────────────────────────────────────────────────
410
  def _get_state():
411
- phase = _status["phase"]
412
- if _status["done"]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  label = "✅ Training complete — model pushed to HF Hub"
414
- elif _status["error"]:
415
- label = f"❌ Error: {_status['error']}"
416
  else:
417
- icons = {
418
- "starting": "⏳", "training": "🔄",
419
- "saving": "💾", "uploading": "📤",
420
- }
421
  label = f"{icons.get(phase, '🔄')} {phase.capitalize()}..."
422
- return label, "\n".join(_logs[-120:])
423
 
424
 
425
  CSS = """
 
24
  import json, time
25
  import numpy as np
26
 
27
+ # ── Persistent state (file-based so Gradio worker processes can read it) ─────
28
+ _LOG_FILE = "/home/user/app/assets/training_log.txt"
29
+ _STATUS_FILE = "/home/user/app/assets/training_status.json"
30
+
31
+
32
+ def _write_status(phase, done=False, error=None):
33
+ try:
34
+ os.makedirs("/home/user/app/assets", exist_ok=True)
35
+ with open(_STATUS_FILE, "w", encoding="utf-8") as f:
36
+ json.dump({"phase": phase, "done": done, "error": error}, f)
37
+ except Exception:
38
+ pass
39
 
40
 
41
  def _log(msg: str):
42
  ts = time.strftime("%H:%M:%S")
43
  line = f"[{ts}] {msg}"
 
44
  print(line, flush=True)
45
  try:
46
+ os.makedirs("/home/user/app/assets", exist_ok=True)
47
  with open(_LOG_FILE, "a", encoding="utf-8") as f:
48
  f.write(line + "\n")
49
  except Exception:
 
261
  total_steps = int(cfg.get("training", {}).get("total_timesteps", 500_000))
262
  _log(f"Total steps : {total_steps:,}")
263
  _log("Training started...\n")
264
+ _write_status("training")
265
 
266
  reward_logger = RewardLogger(curriculum=curriculum)
267
  checkpoint_cb = CheckpointCallback(
 
291
  _log(f"Final curriculum: {curriculum.progress_str()}")
292
 
293
  # ── Reward curve ────────────────────────────────────
294
+ _write_status("saving")
295
  ep_rewards = reward_logger.episode_rewards or [0.0]
296
  episodes = list(range(len(ep_rewards)))
297
  window = max(50, len(ep_rewards) // 20)
 
328
  _log("Reward curve saved.")
329
 
330
  # ── Push everything to HF Hub ────────────────────────
331
+ _write_status("uploading")
332
  _log(f"Pushing to https://huggingface.co/{HF_REPO} ...")
333
 
334
  ep = reward_logger.episode_rewards
 
398
 
399
  _log(f"Uploaded {len(ops)} files.")
400
  _log(f"Model live at: https://huggingface.co/{HF_REPO}")
401
+ _write_status("complete", done=True)
 
402
 
403
  except Exception as exc:
404
  import traceback
405
  _log(f"ERROR: {exc}")
406
  _log(traceback.format_exc())
407
+ _write_status("error", error=str(exc))
 
408
 
409
 
410
  # ── Start training immediately on Space boot ──────────────────
 
414
 
415
  # ── Gradio UI ─────────────────────────────────────────────────
416
  def _get_state():
417
+ # Read from files — works across Gradio worker processes
418
+ try:
419
+ with open(_STATUS_FILE, "r", encoding="utf-8") as f:
420
+ s = json.load(f)
421
+ phase, done, error = s.get("phase", "starting"), s.get("done", False), s.get("error")
422
+ except Exception:
423
+ phase, done, error = "starting", False, None
424
+
425
+ try:
426
+ with open(_LOG_FILE, "r", encoding="utf-8") as f:
427
+ lines = f.readlines()
428
+ log_text = "".join(lines[-120:])
429
+ except Exception:
430
+ log_text = ""
431
+
432
+ if done:
433
  label = "✅ Training complete — model pushed to HF Hub"
434
+ elif error:
435
+ label = f"❌ Error: {error}"
436
  else:
437
+ icons = {"starting": "⏳", "training": "🔄", "saving": "💾", "uploading": "📤"}
 
 
 
438
  label = f"{icons.get(phase, '🔄')} {phase.capitalize()}..."
439
+ return label, log_text
440
 
441
 
442
  CSS = """