Spaces:

dippoo
/

content-engine

Running

App Files Files

dippoo committed on Feb 18

Commit

f072956

1 Parent(s): 1dc4764

Fix detached training launch: use setsid, close FDs, add debug logging

Browse files

Files changed (1) hide show

src/content_engine/services/runpod_trainer.py +30 -4

src/content_engine/services/runpod_trainer.py CHANGED Viewed

@@ -594,17 +594,43 @@ resolution = [{resolution}, {resolution}]
             log_file = "/tmp/training.log"
             pid_file = "/tmp/training.pid"
             exit_file = "/tmp/training.exit"
-            await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
             # Write training command to a script file (avoids quoting issues with nohup)
             script_file = "/tmp/train.sh"
-            # Escape the command for heredoc
             await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
             await self._ssh_exec(ssh, f"chmod +x {script_file}")
-            await self._ssh_exec(ssh, f"nohup {script_file} &\necho $! > {pid_file}", timeout=10)
-            await asyncio.sleep(2)
             pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
             job._log(f"Training PID: {pid}")
             # Monitor the log file (reconnect-safe)
             last_offset = 0
             while True:

             log_file = "/tmp/training.log"
             pid_file = "/tmp/training.pid"
             exit_file = "/tmp/training.exit"
+            await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file} {pid_file}")
             # Write training command to a script file (avoids quoting issues with nohup)
             script_file = "/tmp/train.sh"
             await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
             await self._ssh_exec(ssh, f"chmod +x {script_file}")
+            # Verify script was written
+            script_check = (await self._ssh_exec(ssh, f"wc -l < {script_file}")).strip()
+            job._log(f"Training script written ({script_check} lines)")
+            # Launch fully detached: close all FDs so SSH channel doesn't hang
+            await self._ssh_exec(
+                ssh,
+                f"setsid {script_file} </dev/null >/dev/null 2>&1 &\necho $! > {pid_file}",
+                timeout=15,
+            )
+            await asyncio.sleep(3)
             pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
+            if not pid:
+                # Fallback: find the process by script name
+                pid = (await self._ssh_exec(ssh, "pgrep -f train.sh 2>/dev/null | head -1")).strip()
             job._log(f"Training PID: {pid}")
+            # Verify process is actually running
+            if pid:
+                running = (await self._ssh_exec(ssh, f"kill -0 {pid} 2>&1 && echo RUNNING || echo DEAD")).strip()
+                job._log(f"Process status: {running}")
+                if "DEAD" in running:
+                    # Check if it already wrote an exit code (fast failure)
+                    early_exit = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
+                    early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
+                    raise RuntimeError(f"Training process died immediately. Exit: {early_exit}\nLog: {early_log}")
+            else:
+                early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
+                raise RuntimeError(f"Failed to start training process.\nLog: {early_log}")
             # Monitor the log file (reconnect-safe)
             last_offset = 0
             while True: