Spaces:
Running on T4
Running on T4
Claude committed on
Add volume verification, fsync, and stdout fallback for training outputs
Browse files
- Verify volume is mounted and writable at startup (canary file) before
expensive training begins — fails fast with clear error message
- Add fsync after all critical file writes (logs, JSON, report) to ensure
data is flushed to the volume before container termination
- Print full report to stdout after saving so it's always visible in logs
- Save training JSON incrementally after each step (not just at the end)
https://claude.ai/code/session_01DPirJ78YYN4fJUvUFJ5D6V
- layer1/train.py +43 -0
- layer1/training_logger.py +11 -0
layer1/train.py
CHANGED
|
@@ -22,6 +22,7 @@ import json
|
|
| 22 |
import logging
|
| 23 |
import sys
|
| 24 |
import os
|
|
|
|
| 25 |
|
| 26 |
# Auto-load .env for HF_TOKEN
|
| 27 |
from dotenv import load_dotenv
|
|
@@ -40,6 +41,29 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s
|
|
| 40 |
logger = logging.getLogger(__name__)
|
| 41 |
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def load_evaluator(
|
| 44 |
hf_token: str | None = None,
|
| 45 |
gen_cfg: dict | None = None,
|
|
@@ -117,6 +141,13 @@ def _print_config_banner(config: GRPOConfig, report_cfg: dict, paths_cfg: dict):
|
|
| 117 |
def run_train(config: GRPOConfig, report_cfg: dict, paths_cfg: dict, hf_token: str | None, gen_cfg: dict | None = None, personas_cfg: dict | None = None):
|
| 118 |
"""Run GRPO training."""
|
| 119 |
_print_config_banner(config, report_cfg, paths_cfg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
evaluator = load_evaluator(hf_token, gen_cfg=gen_cfg, personas_cfg=personas_cfg)
|
| 121 |
training_logger = TrainingLogger(
|
| 122 |
log_dir=paths_cfg["log_dir"], total_steps=config.num_training_steps
|
|
@@ -149,6 +180,18 @@ def run_train(config: GRPOConfig, report_cfg: dict, paths_cfg: dict, hf_token: s
|
|
| 149 |
)
|
| 150 |
print(f"\nReport saved to {report_path}")
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
def run_eval(hf_token: str | None, prompt: str, episodes: int):
|
| 154 |
"""Evaluate a single prompt."""
|
|
|
|
| 22 |
import logging
|
| 23 |
import sys
|
| 24 |
import os
|
| 25 |
+
from datetime import datetime
|
| 26 |
|
| 27 |
# Auto-load .env for HF_TOKEN
|
| 28 |
from dotenv import load_dotenv
|
|
|
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
|
| 44 |
+
def verify_volume_mount(paths_cfg: dict) -> None:
    """Verify the output volume is mounted and writable before training starts.

    Writes a small fsync'd canary file into each configured output directory
    so a missing or read-only volume fails fast — with a clear diagnostic —
    instead of after hours of expensive training.

    Args:
        paths_cfg: Path configuration dict; the ``output_dir`` and
            ``log_dir`` keys are checked when present and non-empty.

    Raises:
        OSError: If any configured directory cannot be created or written to.
    """
    output_dirs = [
        paths_cfg.get("output_dir", ""),
        paths_cfg.get("log_dir", ""),
    ]
    for d in output_dirs:
        if not d:
            # Unconfigured path — nothing to verify.
            continue
        try:
            # makedirs is inside the try so a missing or read-only mount
            # point produces the same clear diagnostic as a failed write
            # (previously it raised before the warning could be printed).
            os.makedirs(d, exist_ok=True)
            canary = os.path.join(d, ".volume_check")
            with open(canary, "w") as f:
                f.write(f"volume check {datetime.now().isoformat()}\n")
                f.flush()
                # fsync forces the bytes to the volume so the check cannot
                # pass on a buffered write that is later lost.
                os.fsync(f.fileno())
            logger.info("Volume check OK: %s", d)
        except OSError as e:
            logger.error("VOLUME WRITE FAILED for %s: %s", d, e)
            print(f"\n*** WARNING: Cannot write to {d} — volume may not be mounted! ***\n")
            raise
|
| 65 |
+
|
| 66 |
+
|
| 67 |
def load_evaluator(
|
| 68 |
hf_token: str | None = None,
|
| 69 |
gen_cfg: dict | None = None,
|
|
|
|
| 141 |
def run_train(config: GRPOConfig, report_cfg: dict, paths_cfg: dict, hf_token: str | None, gen_cfg: dict | None = None, personas_cfg: dict | None = None):
|
| 142 |
"""Run GRPO training."""
|
| 143 |
_print_config_banner(config, report_cfg, paths_cfg)
|
| 144 |
+
|
| 145 |
+
# Verify volume is mounted before doing any expensive work
|
| 146 |
+
all_paths = dict(paths_cfg)
|
| 147 |
+
if report_cfg.get("enabled") and report_cfg.get("output_dir"):
|
| 148 |
+
all_paths["report_dir"] = report_cfg["output_dir"]
|
| 149 |
+
verify_volume_mount(all_paths)
|
| 150 |
+
|
| 151 |
evaluator = load_evaluator(hf_token, gen_cfg=gen_cfg, personas_cfg=personas_cfg)
|
| 152 |
training_logger = TrainingLogger(
|
| 153 |
log_dir=paths_cfg["log_dir"], total_steps=config.num_training_steps
|
|
|
|
| 180 |
)
|
| 181 |
print(f"\nReport saved to {report_path}")
|
| 182 |
|
| 183 |
+
# Print report to stdout as fallback (always visible in logs)
|
| 184 |
+
try:
|
| 185 |
+
with open(report_path, "r") as f:
|
| 186 |
+
report_content = f.read()
|
| 187 |
+
print(f"\n{'='*60}")
|
| 188 |
+
print("REPORT CONTENT (stdout fallback)")
|
| 189 |
+
print(f"{'='*60}")
|
| 190 |
+
print(report_content)
|
| 191 |
+
print(f"{'='*60}")
|
| 192 |
+
except OSError:
|
| 193 |
+
print("WARNING: Could not re-read report from disk")
|
| 194 |
+
|
| 195 |
|
| 196 |
def run_eval(hf_token: str | None, prompt: str, episodes: int):
|
| 197 |
"""Evaluate a single prompt."""
|
layer1/training_logger.py
CHANGED
|
@@ -36,6 +36,8 @@ class TrainingLogger:
|
|
| 36 |
with open(self.log_path, "w") as f:
|
| 37 |
f.write(f"Training Log — {self._start_time.isoformat()}\n")
|
| 38 |
f.write(f"{'=' * 60}\n\n")
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def log_iteration(self, step: int, prompt: str, eval_result: dict[str, Any]):
|
| 41 |
"""Log a single training iteration (one prompt evaluated)."""
|
|
@@ -59,6 +61,11 @@ class TrainingLogger:
|
|
| 59 |
f.write(f"Min/Max: {entry['min_reward']:.1f} / {entry['max_reward']:.1f}\n")
|
| 60 |
f.write(f"Episodes: {entry['num_episodes']}\n")
|
| 61 |
f.write(f"---\n\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
logger.info("Logged step %d: mean_reward=%.1f", step, entry["mean_reward"])
|
| 64 |
|
|
@@ -76,6 +83,8 @@ class TrainingLogger:
|
|
| 76 |
}
|
| 77 |
with open(self.json_path, "w") as f:
|
| 78 |
json.dump(data, f, indent=2, default=str)
|
|
|
|
|
|
|
| 79 |
logger.info("Training data saved to %s", self.json_path)
|
| 80 |
|
| 81 |
def get_checkpoint_indices(self) -> list[int]:
|
|
@@ -418,3 +427,5 @@ class ReportGenerator:
|
|
| 418 |
|
| 419 |
with open(report_path, "w") as f:
|
| 420 |
f.write("\n".join(lines))
|
|
|
|
|
|
|
|
|
| 36 |
with open(self.log_path, "w") as f:
|
| 37 |
f.write(f"Training Log — {self._start_time.isoformat()}\n")
|
| 38 |
f.write(f"{'=' * 60}\n\n")
|
| 39 |
+
f.flush()
|
| 40 |
+
os.fsync(f.fileno())
|
| 41 |
|
| 42 |
def log_iteration(self, step: int, prompt: str, eval_result: dict[str, Any]):
|
| 43 |
"""Log a single training iteration (one prompt evaluated)."""
|
|
|
|
| 61 |
f.write(f"Min/Max: {entry['min_reward']:.1f} / {entry['max_reward']:.1f}\n")
|
| 62 |
f.write(f"Episodes: {entry['num_episodes']}\n")
|
| 63 |
f.write(f"---\n\n")
|
| 64 |
+
f.flush()
|
| 65 |
+
os.fsync(f.fileno())
|
| 66 |
+
|
| 67 |
+
# Incremental save — persist JSON after every step so data survives crashes
|
| 68 |
+
self.save_json()
|
| 69 |
|
| 70 |
logger.info("Logged step %d: mean_reward=%.1f", step, entry["mean_reward"])
|
| 71 |
|
|
|
|
| 83 |
}
|
| 84 |
with open(self.json_path, "w") as f:
|
| 85 |
json.dump(data, f, indent=2, default=str)
|
| 86 |
+
f.flush()
|
| 87 |
+
os.fsync(f.fileno())
|
| 88 |
logger.info("Training data saved to %s", self.json_path)
|
| 89 |
|
| 90 |
def get_checkpoint_indices(self) -> list[int]:
|
|
|
|
| 427 |
|
| 428 |
with open(report_path, "w") as f:
|
| 429 |
f.write("\n".join(lines))
|
| 430 |
+
f.flush()
|
| 431 |
+
os.fsync(f.fileno())
|