dippoo committed on
Commit
f072956
·
1 Parent(s): 1dc4764

Fix detached training launch: use setsid, close FDs, add debug logging

Browse files
src/content_engine/services/runpod_trainer.py CHANGED
@@ -594,17 +594,43 @@ resolution = [{resolution}, {resolution}]
594
  log_file = "/tmp/training.log"
595
  pid_file = "/tmp/training.pid"
596
  exit_file = "/tmp/training.exit"
597
- await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
 
598
  # Write training command to a script file (avoids quoting issues with nohup)
599
  script_file = "/tmp/train.sh"
600
- # Escape the command for heredoc
601
  await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
602
  await self._ssh_exec(ssh, f"chmod +x {script_file}")
603
- await self._ssh_exec(ssh, f"nohup {script_file} &\necho $! > {pid_file}", timeout=10)
604
- await asyncio.sleep(2)
 
 
 
 
 
 
 
 
 
 
605
  pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
 
 
 
606
  job._log(f"Training PID: {pid}")
607
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
  # Monitor the log file (reconnect-safe)
609
  last_offset = 0
610
  while True:
 
594
  log_file = "/tmp/training.log"
595
  pid_file = "/tmp/training.pid"
596
  exit_file = "/tmp/training.exit"
597
+ await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file} {pid_file}")
598
+
599
  # Write training command to a script file (avoids quoting issues with nohup)
600
  script_file = "/tmp/train.sh"
 
601
  await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
602
  await self._ssh_exec(ssh, f"chmod +x {script_file}")
603
+
604
+ # Verify script was written
605
+ script_check = (await self._ssh_exec(ssh, f"wc -l < {script_file}")).strip()
606
+ job._log(f"Training script written ({script_check} lines)")
607
+
608
+ # Launch fully detached: close all FDs so SSH channel doesn't hang
609
+ await self._ssh_exec(
610
+ ssh,
611
+ f"setsid {script_file} </dev/null >/dev/null 2>&1 &\necho $! > {pid_file}",
612
+ timeout=15,
613
+ )
614
+ await asyncio.sleep(3)
615
  pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
616
+ if not pid:
617
+ # Fallback: find the process by script name
618
+ pid = (await self._ssh_exec(ssh, "pgrep -f train.sh 2>/dev/null | head -1")).strip()
619
  job._log(f"Training PID: {pid}")
620
 
621
+ # Verify process is actually running
622
+ if pid:
623
+ running = (await self._ssh_exec(ssh, f"kill -0 {pid} 2>&1 && echo RUNNING || echo DEAD")).strip()
624
+ job._log(f"Process status: {running}")
625
+ if "DEAD" in running:
626
+ # Check if it already wrote an exit code (fast failure)
627
+ early_exit = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
628
+ early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
629
+ raise RuntimeError(f"Training process died immediately. Exit: {early_exit}\nLog: {early_log}")
630
+ else:
631
+ early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
632
+ raise RuntimeError(f"Failed to start training process.\nLog: {early_log}")
633
+
634
  # Monitor the log file (reconnect-safe)
635
  last_offset = 0
636
  while True: