dippoo Claude Opus 4.6 committed on
Commit
85988f2
·
1 Parent(s): ce5bf6b

Fix nohup quoting: write train command to script file first

Browse files

The train_cmd contains single quotes (in the optimizer args), which broke
the nohup bash -c '...' wrapper. The command is now written to /tmp/train.sh
via a heredoc, and nohup is then run on that script file.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

src/content_engine/services/runpod_trainer.py CHANGED
@@ -595,12 +595,12 @@ resolution = [{resolution}, {resolution}]
595
  pid_file = "/tmp/training.pid"
596
  exit_file = "/tmp/training.exit"
597
  await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
598
- # Run in background with nohup, redirect output to log, save PID and exit code
599
- detached_cmd = (
600
- f"nohup bash -c '{train_cmd} > {log_file} 2>&1; echo $? > {exit_file}' &\n"
601
- f"echo $! > {pid_file}"
602
- )
603
- await self._ssh_exec(ssh, detached_cmd, timeout=10)
604
  await asyncio.sleep(2)
605
  pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
606
  job._log(f"Training PID: {pid}")
 
595
  pid_file = "/tmp/training.pid"
596
  exit_file = "/tmp/training.exit"
597
  await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file}")
598
+ # Write training command to a script file (avoids quoting issues with nohup)
599
+ script_file = "/tmp/train.sh"
600
+ # Quoted heredoc delimiter ('TRAINEOF') makes the remote shell write the command and $? literally, with no expansion
601
+ await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
602
+ await self._ssh_exec(ssh, f"chmod +x {script_file}")
603
+ await self._ssh_exec(ssh, f"nohup {script_file} &\necho $! > {pid_file}", timeout=10)
604
  await asyncio.sleep(2)
605
  pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
606
  job._log(f"Training PID: {pid}")