Spaces:
Paused
Paused
Commit ·
52e0424
1
Parent(s): 0c2a6e7
Fix logs not showing: use file-based state instead of in-memory globals
Browse files
Gradio 5 SSR mode runs event handlers in separate worker processes.
Those workers have their own empty copies of _logs/_status and never
see updates from the training thread in the main process.
Fix: _log() already writes to a file. Now _write_status() persists
phase/done/error to training_status.json. _get_state() reads both
files so it works regardless of which process handles the request.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -24,18 +24,26 @@ import threading
|
|
| 24 |
import json, time
|
| 25 |
import numpy as np
|
| 26 |
|
| 27 |
-
# ββ
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def _log(msg: str):
|
| 34 |
ts = time.strftime("%H:%M:%S")
|
| 35 |
line = f"[{ts}] {msg}"
|
| 36 |
-
_logs.append(line)
|
| 37 |
print(line, flush=True)
|
| 38 |
try:
|
|
|
|
| 39 |
with open(_LOG_FILE, "a", encoding="utf-8") as f:
|
| 40 |
f.write(line + "\n")
|
| 41 |
except Exception:
|
|
@@ -253,7 +261,7 @@ def _training_thread():
|
|
| 253 |
total_steps = int(cfg.get("training", {}).get("total_timesteps", 500_000))
|
| 254 |
_log(f"Total steps : {total_steps:,}")
|
| 255 |
_log("Training started...\n")
|
| 256 |
-
|
| 257 |
|
| 258 |
reward_logger = RewardLogger(curriculum=curriculum)
|
| 259 |
checkpoint_cb = CheckpointCallback(
|
|
@@ -283,7 +291,7 @@ def _training_thread():
|
|
| 283 |
_log(f"Final curriculum: {curriculum.progress_str()}")
|
| 284 |
|
| 285 |
# ββ Reward curve ββββββββββββββββββββββββββββββββββββ
|
| 286 |
-
|
| 287 |
ep_rewards = reward_logger.episode_rewards or [0.0]
|
| 288 |
episodes = list(range(len(ep_rewards)))
|
| 289 |
window = max(50, len(ep_rewards) // 20)
|
|
@@ -320,7 +328,7 @@ def _training_thread():
|
|
| 320 |
_log("Reward curve saved.")
|
| 321 |
|
| 322 |
# ββ Push everything to HF Hub ββββββββββββββββββββββββ
|
| 323 |
-
|
| 324 |
_log(f"Pushing to https://huggingface.co/{HF_REPO} ...")
|
| 325 |
|
| 326 |
ep = reward_logger.episode_rewards
|
|
@@ -390,15 +398,13 @@ model = RecurrentPPO.load(hf_hub_download("{HF_REPO}", "spindleflow_model.zip"))
|
|
| 390 |
|
| 391 |
_log(f"Uploaded {len(ops)} files.")
|
| 392 |
_log(f"Model live at: https://huggingface.co/{HF_REPO}")
|
| 393 |
-
|
| 394 |
-
_status["phase"] = "complete"
|
| 395 |
|
| 396 |
except Exception as exc:
|
| 397 |
import traceback
|
| 398 |
_log(f"ERROR: {exc}")
|
| 399 |
_log(traceback.format_exc())
|
| 400 |
-
|
| 401 |
-
_status["phase"] = "error"
|
| 402 |
|
| 403 |
|
| 404 |
# ββ Start training immediately on Space boot ββββββββββββββββββ
|
|
@@ -408,18 +414,29 @@ _thread.start()
|
|
| 408 |
|
| 409 |
# ββ Gradio UI βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 410 |
def _get_state():
|
| 411 |
-
|
| 412 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
label = "β
Training complete β model pushed to HF Hub"
|
| 414 |
-
elif
|
| 415 |
-
label = f"β Error: {
|
| 416 |
else:
|
| 417 |
-
icons = {
|
| 418 |
-
"starting": "β³", "training": "π",
|
| 419 |
-
"saving": "πΎ", "uploading": "π€",
|
| 420 |
-
}
|
| 421 |
label = f"{icons.get(phase, 'π')} {phase.capitalize()}..."
|
| 422 |
-
return label,
|
| 423 |
|
| 424 |
|
| 425 |
CSS = """
|
|
|
|
| 24 |
import json, time
|
| 25 |
import numpy as np
|
| 26 |
|
| 27 |
+
# ── Persistent state (file-based so Gradio worker processes can read it) ─────
|
| 28 |
+
_LOG_FILE = "/home/user/app/assets/training_log.txt"
|
| 29 |
+
_STATUS_FILE = "/home/user/app/assets/training_status.json"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _write_status(phase, done=False, error=None):
    """Persist the current training status to ``_STATUS_FILE`` as JSON.

    The status lives on disk rather than in module globals so that any
    process able to read the file sees the same state.

    Args:
        phase: Short name of the current stage (callers in this file use
            "training", "saving", "uploading", "complete" and "error").
        done: True once the whole run has finished successfully.
        error: Error message to surface, or None when there is none.

    The write is best-effort: a failure is reported on stderr and never
    raised, so a status update cannot abort the caller.
    """
    import sys  # local import: only needed for the failure report below

    # Write a sibling temp file and rename it into place. Opening the real
    # file with "w" truncates it first, so a concurrent reader could observe
    # an empty or half-written JSON document; os.replace swaps it atomically.
    tmp_path = f"{_STATUS_FILE}.{os.getpid()}.tmp"
    try:
        # Derive the directory from the target path instead of repeating it.
        os.makedirs(os.path.dirname(_STATUS_FILE) or ".", exist_ok=True)
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"phase": phase, "done": done, "error": error}, f)
        os.replace(tmp_path, _STATUS_FILE)
    except Exception as exc:
        # Deliberately broad: status reporting must never take the caller
        # down, but the failure should at least be visible in the console.
        print(
            f"[status] could not write {_STATUS_FILE}: {exc}",
            file=sys.stderr,
            flush=True,
        )
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # temp file was never created or is already gone
|
| 39 |
|
| 40 |
|
| 41 |
def _log(msg: str):
|
| 42 |
ts = time.strftime("%H:%M:%S")
|
| 43 |
line = f"[{ts}] {msg}"
|
|
|
|
| 44 |
print(line, flush=True)
|
| 45 |
try:
|
| 46 |
+
os.makedirs("/home/user/app/assets", exist_ok=True)
|
| 47 |
with open(_LOG_FILE, "a", encoding="utf-8") as f:
|
| 48 |
f.write(line + "\n")
|
| 49 |
except Exception:
|
|
|
|
| 261 |
total_steps = int(cfg.get("training", {}).get("total_timesteps", 500_000))
|
| 262 |
_log(f"Total steps : {total_steps:,}")
|
| 263 |
_log("Training started...\n")
|
| 264 |
+
_write_status("training")
|
| 265 |
|
| 266 |
reward_logger = RewardLogger(curriculum=curriculum)
|
| 267 |
checkpoint_cb = CheckpointCallback(
|
|
|
|
| 291 |
_log(f"Final curriculum: {curriculum.progress_str()}")
|
| 292 |
|
| 293 |
# ββ Reward curve ββββββββββββββββββββββββββββββββββββ
|
| 294 |
+
_write_status("saving")
|
| 295 |
ep_rewards = reward_logger.episode_rewards or [0.0]
|
| 296 |
episodes = list(range(len(ep_rewards)))
|
| 297 |
window = max(50, len(ep_rewards) // 20)
|
|
|
|
| 328 |
_log("Reward curve saved.")
|
| 329 |
|
| 330 |
# ββ Push everything to HF Hub ββββββββββββββββββββββββ
|
| 331 |
+
_write_status("uploading")
|
| 332 |
_log(f"Pushing to https://huggingface.co/{HF_REPO} ...")
|
| 333 |
|
| 334 |
ep = reward_logger.episode_rewards
|
|
|
|
| 398 |
|
| 399 |
_log(f"Uploaded {len(ops)} files.")
|
| 400 |
_log(f"Model live at: https://huggingface.co/{HF_REPO}")
|
| 401 |
+
_write_status("complete", done=True)
|
|
|
|
| 402 |
|
| 403 |
except Exception as exc:
|
| 404 |
import traceback
|
| 405 |
_log(f"ERROR: {exc}")
|
| 406 |
_log(traceback.format_exc())
|
| 407 |
+
_write_status("error", error=str(exc))
|
|
|
|
| 408 |
|
| 409 |
|
| 410 |
# ββ Start training immediately on Space boot ββββββββββββββββββ
|
|
|
|
| 414 |
|
| 415 |
# ββ Gradio UI βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 416 |
def _get_state():
    """Return ``(label, log_text)`` describing the current training run.

    Both values are read from disk (``_STATUS_FILE`` and ``_LOG_FILE``) on
    every call, so the result does not depend on which process handles the
    request.

    Returns:
        tuple[str, str]: a one-line status label, and the last 120 lines of
        the training log (an empty string if the log cannot be read).
    """
    from collections import deque

    # Defaults apply when the status file is missing, unreadable, or does not
    # contain a JSON object.
    phase, done, error = "starting", False, None
    try:
        with open(_STATUS_FILE, "r", encoding="utf-8") as f:
            status = json.load(f)
    except (OSError, ValueError):  # JSONDecodeError/UnicodeDecodeError are ValueErrors
        status = None
    if isinstance(status, dict):
        # A JSON null phase would otherwise break phase.capitalize() below.
        phase = str(status.get("phase") or "starting")
        done = bool(status.get("done", False))
        error = status.get("error")

    try:
        # errors="replace": a read that lands in the middle of a multibyte
        # character being appended must not blank the whole log view.
        with open(_LOG_FILE, "r", encoding="utf-8", errors="replace") as f:
            # Stream the file and keep only the tail instead of loading every
            # line of an ever-growing log into memory.
            log_text = "".join(deque(f, maxlen=120))
    except OSError:
        log_text = ""

    if done:
        label = "✅ Training complete — model pushed to HF Hub"
    elif error:
        label = f"❌ Error: {error}"
    else:
        # NOTE(review): the "training" glyph and the fallback glyph were not
        # recoverable from the reviewed copy — confirm against the repository.
        icons = {"starting": "⏳", "training": "🚀", "saving": "💾", "uploading": "📤"}
        label = f"{icons.get(phase, '🔄')} {phase.capitalize()}..."
    return label, log_text
|
| 440 |
|
| 441 |
|
| 442 |
CSS = """
|