Spaces:

zino36
/

lerobot-pusht-trainer

Sleeping

App Files Community

zino36 committed on Sep 29

Commit

ad5a998

verified ·

1 Parent(s): 6a460fa

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -27

app.py CHANGED Viewed

@@ -2,14 +2,14 @@ import os, subprocess, json, pathlib, time, shutil
 import gradio as gr
 # ---------- CONSTANTS (visible in App Files) ----------
-RUN_ROOT = "/home/user/app/runs"          # where all runs live
-LOG_ROOT = "/home/user/app/logs"          # global logs (so we don't pre-create run dirs)
 LAST_PTR = pathlib.Path(RUN_ROOT) / "LAST" # remembers most recent run path
 os.makedirs(RUN_ROOT, exist_ok=True)
 os.makedirs(LOG_ROOT, exist_ok=True)
 # ---------- ENV / HUB ----------
-DEFAULT_REPO_ID = os.environ.get("REPO_ID", "")       # e.g. "zino36/lerobot-pusht-colab"
 PUSH_DEFAULT    = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
 HF_TOKEN        = os.environ.get("HF_TOKEN")
@@ -43,8 +43,12 @@ def tail_file(path: str, n=200):
     return "".join(lines[-n:])
 # ---------- RUN DIR HELPERS ----------
 def newest_run(prefer_checkpoint: bool = True) -> str:
-    """Return the newest pusht_* folder. If prefer_checkpoint=True, pick the newest that has checkpoints/last/."""
     root = pathlib.Path(RUN_ROOT)
     if not root.exists():
         return ""
@@ -56,9 +60,9 @@ def newest_run(prefer_checkpoint: bool = True) -> str:
             if has_checkpoint(str(r)):
                 return str(r)
     return str(runs[0])
 def new_run_dir():
-    """Return a unique run dir path WITHOUT creating it (so LeRobot can create it)."""
     base = pathlib.Path(RUN_ROOT) / f"pusht_{int(time.time())}"
     d = base
     i = 1
@@ -72,15 +76,14 @@ def current_run_dir(user_override: str | None):
     """
     Resolve which run to use:
     - If user typed something, accept folder name or full path.
-    - Else try LAST pointer.
-    - Else pick newest run with a checkpoint.
-    - Else pick newest run (even without checkpoint).
     - Else return "" (none).
     """
     # A) explicit user input
     if user_override and user_override.strip():
         p = user_override.strip()
-        # allow just "pusht_123..." as well as absolute path
         if not p.startswith("/"):
             p = str(pathlib.Path(RUN_ROOT) / p)
         return p
@@ -103,16 +106,10 @@ def current_run_dir(user_override: str | None):
         LAST_PTR.write_text(p)
         return p
-    # E) nothing found
     return ""
-def has_checkpoint(run_dir: str):
-    """We consider a checkpoint present once checkpoints/last/ exists (first save is at step 500)."""
-    return os.path.isdir(os.path.join(run_dir, "checkpoints", "last"))
 def train_log_path_for_new(run_dir: str):
-    """Write fresh-run logs to global LOG_ROOT so we don't pre-create run_dir."""
     name = pathlib.Path(run_dir).name
     return os.path.join(LOG_ROOT, f"{name}.train.log")
@@ -148,6 +145,7 @@ def start_training(steps, batch_size, push_to_hub, repo_id):
     return msg, run_dir, tail_file(log)
 def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
     run_dir = current_run_dir(run_dir_text)
     if not run_dir:
         return "No run found on disk. Start a fresh training first (let it pass step 500 to create a checkpoint).", "", "(no log)"
@@ -155,20 +153,29 @@ def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
     if not has_checkpoint(run_dir):
         return f"Selected run: {run_dir}\nNo checkpoint in {run_dir}/checkpoints/last/ yet — run at least 500 steps once.", run_dir, tail_file(log)
     push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
                   if push_to_hub and repo_id.strip() else
                   "--policy.push_to_hub=false")
-    cmd = (
         "lerobot-train "
         f"--output_dir='{run_dir}' "
         "--resume=true "
         f"--steps={extra_steps} "
         "--eval_freq=500 "
         "--save_freq=500 "
-        f"{push_flags}"
     )
     rc, tail = _run(cmd, log)
     msg = f"Resumed run at: {run_dir}\nResume exited rc={rc}\n\n=== train.log tail ===\n{tail}"
     return msg, run_dir, tail_file(log)
@@ -196,12 +203,10 @@ def eval_latest(run_dir_text):
     )
     rc, tail = _run(cmd, elog)
-    # --- optional patch: parse the printed dict and write metrics.json ---
-    import re, ast, json, pathlib
     metrics_txt = "(metrics.json not found)"
     p = pathlib.Path(eval_out_dir) / "metrics.json"
-    # Try to parse the last dict-like summary from the log tail
     m = re.findall(r"\{[^}]+pc_success[^}]+\}", tail, flags=re.S)
     if m:
         try:
@@ -217,7 +222,6 @@ def eval_latest(run_dir_text):
         except Exception:
             pass
     elif p.exists():
-        # Fallback: if a previous metrics.json exists, show it
         try:
             d = json.loads(p.read_text())
             metrics_txt = f"Success rate: {d.get('success_rate')}\nAvg max overlap: {d.get('avg_max_overlap')}"
@@ -227,7 +231,7 @@ def eval_latest(run_dir_text):
     msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
     return msg, run_dir, tail_file(elog), metrics_txt
 # ---------- Maintenance (list / delete runs) ----------
 def list_runs():
     root = pathlib.Path(RUN_ROOT)
@@ -255,7 +259,6 @@ def delete_run_by_name(name: str):
     if not os.path.isdir(target):
         return f"Folder not found: {target}", list_runs()
     shutil.rmtree(target, ignore_errors=True)
-    # clear LAST if it pointed here
     if LAST_PTR.exists() and LAST_PTR.read_text().strip() == target:
         LAST_PTR.unlink(missing_ok=True)
     return f"Deleted {target}", list_runs()

 import gradio as gr
 # ---------- CONSTANTS (visible in App Files) ----------
+RUN_ROOT = "/home/user/app/runs"           # where all runs live
+LOG_ROOT = "/home/user/app/logs"           # global logs (avoid pre-creating run dirs)
 LAST_PTR = pathlib.Path(RUN_ROOT) / "LAST" # remembers most recent run path
 os.makedirs(RUN_ROOT, exist_ok=True)
 os.makedirs(LOG_ROOT, exist_ok=True)
 # ---------- ENV / HUB ----------
+DEFAULT_REPO_ID = os.environ.get("REPO_ID", "")  # e.g. "zino36/lerobot-pusht-colab"
 PUSH_DEFAULT    = os.environ.get("PUSH_TO_HUB", "true").lower() in {"1","true","yes"}
 HF_TOKEN        = os.environ.get("HF_TOKEN")
     return "".join(lines[-n:])
 # ---------- RUN DIR HELPERS ----------
+def has_checkpoint(run_dir: str):
+    """Checkpoint considered present once checkpoints/last/ exists (first save ~ step 500)."""
+    return os.path.isdir(os.path.join(run_dir, "checkpoints", "last"))
 def newest_run(prefer_checkpoint: bool = True) -> str:
+    """Pick newest pusht_* folder; optionally require a checkpoint."""
     root = pathlib.Path(RUN_ROOT)
     if not root.exists():
         return ""
             if has_checkpoint(str(r)):
                 return str(r)
     return str(runs[0])
 def new_run_dir():
+    """Return a unique run dir path WITHOUT creating it (LeRobot will create it)."""
     base = pathlib.Path(RUN_ROOT) / f"pusht_{int(time.time())}"
     d = base
     i = 1
     """
     Resolve which run to use:
     - If user typed something, accept folder name or full path.
+    - Else try LAST pointer (if valid).
+    - Else newest run WITH checkpoint.
+    - Else newest run (even without checkpoint).
     - Else return "" (none).
     """
     # A) explicit user input
     if user_override and user_override.strip():
         p = user_override.strip()
         if not p.startswith("/"):
             p = str(pathlib.Path(RUN_ROOT) / p)
         return p
         LAST_PTR.write_text(p)
         return p
     return ""
 def train_log_path_for_new(run_dir: str):
+    """Fresh-run logs go to global LOG_ROOT so we don't pre-create run_dir."""
     name = pathlib.Path(run_dir).name
     return os.path.join(LOG_ROOT, f"{name}.train.log")
     return msg, run_dir, tail_file(log)
 def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
+    # Auto-pick newest run with a checkpoint when left blank.
     run_dir = current_run_dir(run_dir_text)
     if not run_dir:
         return "No run found on disk. Start a fresh training first (let it pass step 500 to create a checkpoint).", "", "(no log)"
     if not has_checkpoint(run_dir):
         return f"Selected run: {run_dir}\nNo checkpoint in {run_dir}/checkpoints/last/ yet — run at least 500 steps once.", run_dir, tail_file(log)
+    # Build push flags
     push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
                   if push_to_hub and repo_id.strip() else
                   "--policy.push_to_hub=false")
+    # Prefer resuming with the exact saved config from this run
+    cfg_path = os.path.join(run_dir, "train_config.json")
+    base = (
         "lerobot-train "
         f"--output_dir='{run_dir}' "
         "--resume=true "
         f"--steps={extra_steps} "
         "--eval_freq=500 "
         "--save_freq=500 "
     )
+    if os.path.exists(cfg_path):
+        base += f"--config_path='{cfg_path}' "
+    else:
+        # Fallback to minimal schema (prevents policy parsing errors)
+        base += "--policy.type=diffusion --dataset.repo_id=lerobot/pusht --env.type=pusht "
+    cmd = base + push_flags
     rc, tail = _run(cmd, log)
     msg = f"Resumed run at: {run_dir}\nResume exited rc={rc}\n\n=== train.log tail ===\n{tail}"
     return msg, run_dir, tail_file(log)
     )
     rc, tail = _run(cmd, elog)
+    # --- parse printed dict and write metrics.json if missing ---
+    import re, ast
     metrics_txt = "(metrics.json not found)"
     p = pathlib.Path(eval_out_dir) / "metrics.json"
     m = re.findall(r"\{[^}]+pc_success[^}]+\}", tail, flags=re.S)
     if m:
         try:
         except Exception:
             pass
     elif p.exists():
         try:
             d = json.loads(p.read_text())
             metrics_txt = f"Success rate: {d.get('success_rate')}\nAvg max overlap: {d.get('avg_max_overlap')}"
     msg = f"Evaluated run at: {run_dir}\nEval exited rc={rc}\n\n=== eval.log tail ===\n{tail}"
     return msg, run_dir, tail_file(elog), metrics_txt
 # ---------- Maintenance (list / delete runs) ----------
 def list_runs():
     root = pathlib.Path(RUN_ROOT)
     if not os.path.isdir(target):
         return f"Folder not found: {target}", list_runs()
     shutil.rmtree(target, ignore_errors=True)
     if LAST_PTR.exists() and LAST_PTR.read_text().strip() == target:
         LAST_PTR.unlink(missing_ok=True)
     return f"Deleted {target}", list_runs()