Spaces:

zino36
/

lerobot-pusht-trainer

Sleeping

App Files Community

zino36 committed on Sep 29

Commit

3b83a57

verified ·

1 Parent(s): dc206dc

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -24

app.py CHANGED Viewed

@@ -145,42 +145,67 @@ def start_training(steps, batch_size, push_to_hub, repo_id):
     return msg, run_dir, tail_file(log)
 def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
-    # Auto-pick newest run with a checkpoint when left blank.
     run_dir = current_run_dir(run_dir_text)
     if not run_dir:
-        return "No run found on disk. Start a fresh training first (let it pass step 500 to create a checkpoint).", "", "(no log)"
     log = train_log_path(run_dir)
     if not has_checkpoint(run_dir):
-        return f"Selected run: {run_dir}\nNo checkpoint in {run_dir}/checkpoints/last/ yet — run at least 500 steps once.", run_dir, tail_file(log)
-    # Build push flags
-    push_flags = (f"--policy.push_to_hub=true --policy.repo_id='{repo_id.strip()}'"
-                  if push_to_hub and repo_id.strip() else
-                  "--policy.push_to_hub=false")
-    # Prefer resuming with the exact saved config from this run
     cfg_path = os.path.join(run_dir, "train_config.json")
-    base = (
-        "lerobot-train "
-        f"--output_dir='{run_dir}' "
-        "--resume=true "
-        f"--config_path='{cfg_path}' "
-        f"--steps={extra_steps} "
-        "--eval_freq=500 "
-        "--save_freq=500 "
-        f"{push_flags}"
-    )
     if os.path.exists(cfg_path):
-        base += f"--config_path='{cfg_path}' "
     else:
-        base += "--policy.type=diffusion --dataset.repo_id=lerobot/pusht --env.type=pusht "
-    cmd = base + push_flags
     rc, tail = _run(cmd, log)
     msg = f"Resumed run at: {run_dir}\nResume exited rc={rc}\n\n=== train.log tail ===\n{tail}"
     return msg, run_dir, tail_file(log)
 def eval_latest(run_dir_text):
     run_dir = current_run_dir(run_dir_text)
     if not run_dir:

     return msg, run_dir, tail_file(log)
 def resume_training(extra_steps, push_to_hub, repo_id, run_dir_text):
     """Resume training of an existing run and return its status for the UI.

     The run is resolved with ``current_run_dir(run_dir_text)``. If the run
     directory contains ``train_config.json`` that file is handed to the CLI;
     otherwise minimal policy/dataset/env flags are passed instead.

     Args:
         extra_steps: Value forwarded verbatim as ``--steps``.
         push_to_hub: Truthy to push the policy to the Hub (requires ``repo_id``).
         repo_id: Hub repository id; ignored when ``None`` or blank.
         run_dir_text: Run directory hint passed to ``current_run_dir``.

     Returns:
         A ``(message, run_dir, log_tail)`` tuple.
     """
     # Local import: the file's import block is not visible from this hunk.
     import shlex

     # 1) Resolve which run to resume.
     run_dir = current_run_dir(run_dir_text)
     if not run_dir:
         return (
             "No run found on disk. Start a fresh training first (let it pass step 500 to create a checkpoint).",
             "",
             "(no log)",
         )
     log = train_log_path(run_dir)

     # 2) Nothing to resume from until a checkpoint has been written.
     if not has_checkpoint(run_dir):
         return (
             f"Selected run: {run_dir}\nNo checkpoint in {run_dir}/checkpoints/last/ yet — run at least 500 steps once.",
             run_dir,
             tail_file(log),
         )

     # 3) Optional Hub push. repo_id is user-supplied text, so it is quoted with
     #    shlex.quote rather than wrapped in hand-written single quotes.
     repo = (repo_id or "").strip()
     if push_to_hub and repo:
         push_flags = [
             "--policy.push_to_hub=true",
             f"--policy.repo_id={shlex.quote(repo)}",
         ]
     else:
         push_flags = ["--policy.push_to_hub=false"]

     # 4) Assemble the command in its final order.
     cfg_path = os.path.join(run_dir, "train_config.json")
     parts = [
         "lerobot-train",
         f"--output_dir={shlex.quote(str(run_dir))}",
         "--resume=true",
     ]
     if os.path.exists(cfg_path):
         # lerobot-train loads a saved config through --config_path;
         # --config-file is not a flag its parser defines.
         # NOTE(review): lerobot's documented resume flow points --config_path at
         # checkpoints/last/pretrained_model/train_config.json — confirm that
         # run_dir/train_config.json is the intended file.
         parts.append(f"--config_path={shlex.quote(str(cfg_path))}")
     parts.extend([
         f"--steps={extra_steps}",
         "--eval_freq=500",
         "--save_freq=500",
     ])
     if not os.path.exists(cfg_path):
         # Fallback minimal schema so the CLI knows the policy/dataset/env types.
         # NOTE(review): lerobot appears to require a config path whenever
         # --resume=true — verify this branch can actually succeed.
         parts.extend([
             "--policy.type=diffusion",
             "--dataset.repo_id=lerobot/pusht",
             "--env.type=pusht",
         ])
     parts.extend(push_flags)
     cmd = " ".join(parts)

     # 5) Run and return logs.
     rc, tail = _run(cmd, log)
     msg = f"Resumed run at: {run_dir}\nResume exited rc={rc}\n\n=== train.log tail ===\n{tail}"
     return msg, run_dir, tail_file(log)
 def eval_latest(run_dir_text):
     run_dir = current_run_dir(run_dir_text)
     if not run_dir: