Spaces:

dippoo
/

content-engine

Running

dippoo Claude Opus 4.6 committed 25 days ago

Commit

b4ceba9

1 Parent(s): 3aa914c

Run training in detached nohup process (survives SSH/server disconnect)

Training now runs via nohup, with its output written to /tmp/training.log.
The monitor reads the log file instead of the SSH channel's stdout.
The training run is safe against HF Space rebuilds, SSH drops, and browser closes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show

src/content_engine/services/runpod_trainer.py +49 -42

src/content_engine/services/runpod_trainer.py CHANGED Viewed

@@ -589,50 +589,57 @@ resolution = [{resolution}, {resolution}]
                 gpu_type=job.gpu_type,
             )
-            # Execute training and stream output
-            job._log("Training command submitted...")
-            transport = ssh.get_transport()
-            channel = transport.open_session()
-            channel.exec_command(train_cmd)
-            # Read output progressively
-            buffer = ""
-            last_flush = time.time()
-            while not channel.exit_status_ready() or channel.recv_ready():
-                if channel.recv_ready():
-                    chunk = (await asyncio.to_thread(channel.recv, 4096)).decode("utf-8", errors="replace")
-                    buffer += chunk
-                    # Process complete lines (handle both \n and \r for tqdm progress)
-                    while "\n" in buffer or "\r" in buffer:
-                        # Split on whichever comes first
-                        n_pos = buffer.find("\n")
-                        r_pos = buffer.find("\r")
-                        if n_pos == -1:
-                            split_pos = r_pos
-                        elif r_pos == -1:
-                            split_pos = n_pos
-                        else:
-                            split_pos = min(n_pos, r_pos)
-                        line = buffer[:split_pos].strip()
-                        buffer = buffer[split_pos + 1:]
-                        if not line:
-                            continue
-                        job._log(line)
-                        self._parse_progress(job, line)
-                        self._schedule_db_save(job)
-                else:
-                    # Periodically flush buffer for partial tqdm lines
-                    if buffer.strip() and time.time() - last_flush > 10:
-                        job._log(buffer.strip())
-                        self._parse_progress(job, buffer.strip())
-                        buffer = ""
-                        last_flush = time.time()
                         self._schedule_db_save(job)
-                    await asyncio.sleep(2)
-            exit_code = channel.recv_exit_status()
-            if exit_code != 0:
-                raise RuntimeError(f"Training failed with exit code {exit_code}")
             job._log("Training completed on RunPod!")
             job.progress = 0.9

                 gpu_type=job.gpu_type,
             )
+            # Execute training in a detached process (survives SSH disconnect)
+            job._log("Starting training (detached — survives disconnects)...")
+            log_file = "/tmp/training.log"
+            pid_file = "/tmp/training.pid"
+            exit_file = "/tmp/training.exit"
+            await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
+            # Run in background with nohup, redirect output to log, save PID and exit code
+            detached_cmd = (
+                f"nohup bash -c '{train_cmd} > {log_file} 2>&1; echo $? > {exit_file}' &\n"
+                f"echo $! > {pid_file}"
+            )
+            await self._ssh_exec(ssh, detached_cmd, timeout=10)
+            await asyncio.sleep(2)
+            pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
+            job._log(f"Training PID: {pid}")
+            # Monitor the log file (reconnect-safe)
+            last_offset = 0
+            while True:
+                # Check if training finished
+                exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
+                if exit_check:
+                    exit_code = int(exit_check)
+                    # Read remaining log
+                    remaining = (await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30))
+                    if remaining:
+                        for line in remaining.split("\n"):
+                            line = line.strip()
+                            if line:
+                                job._log(line)
+                                self._parse_progress(job, line)
+                    if exit_code != 0:
+                        raise RuntimeError(f"Training failed with exit code {exit_code}")
+                    break
+                # Read new log output
+                try:
+                    new_output = (await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30))
+                    if new_output:
+                        last_offset += len(new_output.encode("utf-8"))
+                        for line in new_output.replace("\r", "\n").split("\n"):
+                            line = line.strip()
+                            if not line:
+                                continue
+                            job._log(line)
+                            self._parse_progress(job, line)
                         self._schedule_db_save(job)
+                except Exception:
+                    job._log("Log read failed, retrying...")
+                await asyncio.sleep(5)
             job._log("Training completed on RunPod!")
             job.progress = 0.9