Spaces:
Running
Running
Fix detached training launch: use setsid, close FDs, add debug logging
Browse files
src/content_engine/services/runpod_trainer.py
CHANGED
|
@@ -594,17 +594,43 @@ resolution = [{resolution}, {resolution}]
|
|
| 594 |
log_file = "/tmp/training.log"
|
| 595 |
pid_file = "/tmp/training.pid"
|
| 596 |
exit_file = "/tmp/training.exit"
|
| 597 |
-
await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
|
|
|
|
| 598 |
# Write training command to a script file (avoids quoting issues with nohup)
|
| 599 |
script_file = "/tmp/train.sh"
|
| 600 |
-
# Escape the command for heredoc
|
| 601 |
await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
|
| 602 |
await self._ssh_exec(ssh, f"chmod +x {script_file}")
|
| 603 |
-
|
| 604 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
|
|
|
|
|
|
|
|
|
|
| 606 |
job._log(f"Training PID: {pid}")
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
# Monitor the log file (reconnect-safe)
|
| 609 |
last_offset = 0
|
| 610 |
while True:
|
|
|
|
| 594 |
log_file = "/tmp/training.log"
|
| 595 |
pid_file = "/tmp/training.pid"
|
| 596 |
exit_file = "/tmp/training.exit"
|
| 597 |
+
await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file} {pid_file}")
|
| 598 |
+
|
| 599 |
# Write training command to a script file (avoids quoting issues with nohup)
|
| 600 |
script_file = "/tmp/train.sh"
|
|
|
|
| 601 |
await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
|
| 602 |
await self._ssh_exec(ssh, f"chmod +x {script_file}")
|
| 603 |
+
|
| 604 |
+
# Verify script was written
|
| 605 |
+
script_check = (await self._ssh_exec(ssh, f"wc -l < {script_file}")).strip()
|
| 606 |
+
job._log(f"Training script written ({script_check} lines)")
|
| 607 |
+
|
| 608 |
+
# Launch fully detached: new session via setsid, with stdin/stdout/stderr redirected to /dev/null so the SSH channel doesn't hang
|
| 609 |
+
await self._ssh_exec(
|
| 610 |
+
ssh,
|
| 611 |
+
f"setsid {script_file} </dev/null >/dev/null 2>&1 &\necho $! > {pid_file}",
|
| 612 |
+
timeout=15,
|
| 613 |
+
)
|
| 614 |
+
await asyncio.sleep(3)
|
| 615 |
pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
|
| 616 |
+
if not pid:
|
| 617 |
+
# Fallback: find the process by script name
|
| 618 |
+
pid = (await self._ssh_exec(ssh, "pgrep -f train.sh 2>/dev/null | head -1")).strip()
|
| 619 |
job._log(f"Training PID: {pid}")
|
| 620 |
|
| 621 |
+
# Verify process is actually running
|
| 622 |
+
if pid:
|
| 623 |
+
running = (await self._ssh_exec(ssh, f"kill -0 {pid} 2>&1 && echo RUNNING || echo DEAD")).strip()
|
| 624 |
+
job._log(f"Process status: {running}")
|
| 625 |
+
if "DEAD" in running:
|
| 626 |
+
# Check if it already wrote an exit code (fast failure)
|
| 627 |
+
early_exit = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
|
| 628 |
+
early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
|
| 629 |
+
raise RuntimeError(f"Training process died immediately. Exit: {early_exit}\nLog: {early_log}")
|
| 630 |
+
else:
|
| 631 |
+
early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
|
| 632 |
+
raise RuntimeError(f"Failed to start training process.\nLog: {early_log}")
|
| 633 |
+
|
| 634 |
# Monitor the log file (reconnect-safe)
|
| 635 |
last_offset = 0
|
| 636 |
while True:
|