Spaces:
Running
Running
Fix nohup quoting: write train command to script file first
Browse files
The train_cmd contains single quotes (optimizer args), which broke
nohup bash -c '...'. The command is now written to /tmp/train.sh via a
heredoc, and nohup is run on that script file.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
src/content_engine/services/runpod_trainer.py
CHANGED
|
@@ -595,12 +595,12 @@ resolution = [{resolution}, {resolution}]
|
|
| 595 |
pid_file = "/tmp/training.pid"
|
| 596 |
exit_file = "/tmp/training.exit"
|
| 597 |
await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
|
| 598 |
-
#
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
)
|
| 603 |
-
await self._ssh_exec(ssh,
|
| 604 |
await asyncio.sleep(2)
|
| 605 |
pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
|
| 606 |
job._log(f"Training PID: {pid}")
|
|
|
|
| 595 |
pid_file = "/tmp/training.pid"
|
| 596 |
exit_file = "/tmp/training.exit"
|
| 597 |
await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
|
| 598 |
+
# Write training command to a script file (avoids quoting issues with nohup)
|
| 599 |
+
script_file = "/tmp/train.sh"
|
| 600 |
+
# The quoted heredoc delimiter ('TRAINEOF') keeps the body literal, so the command needs no escaping
|
| 601 |
+
await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
|
| 602 |
+
await self._ssh_exec(ssh, f"chmod +x {script_file}")
|
| 603 |
+
await self._ssh_exec(ssh, f"nohup {script_file} &\necho $! > {pid_file}", timeout=10)
|
| 604 |
await asyncio.sleep(2)
|
| 605 |
pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
|
| 606 |
job._log(f"Training PID: {pid}")
|