dippoo Claude Opus 4.6 committed on
Commit
1c46fa4
·
1 Parent(s): b4ceba9

Auto-reconnect to running training pods after server restart

Browse files

On startup, checks DB for in-progress training jobs, verifies their
RunPod pods are still running, and reconnects SSH to resume log
monitoring. Handles both still-running and already-completed cases.
Copies LoRA to volume and terminates pod on completion.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

src/content_engine/services/runpod_trainer.py CHANGED
@@ -788,11 +788,27 @@ resolution = [{resolution}, {resolution}]
788
  base_model=db_job.base_model or "sd15_realistic",
789
  model_type=db_job.model_type or "sd15",
790
  )
791
- # Mark interrupted jobs as failed
792
- if job.status not in ("completed", "failed"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
793
  job.status = "failed"
794
  job.error = "Interrupted by server restart"
795
- self._jobs[db_job.id] = job
796
  except Exception as e:
797
  logger.warning("Failed to load training jobs from DB: %s", e)
798
 
@@ -802,6 +818,144 @@ resolution = [{resolution}, {resolution}]
802
  self._loaded_from_db = True
803
  await self._load_jobs_from_db()
804
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
805
  def _build_training_command(
806
  self,
807
  *,
 
788
  base_model=db_job.base_model or "sd15_realistic",
789
  model_type=db_job.model_type or "sd15",
790
  )
791
+ self._jobs[db_job.id] = job
792
+ # Try to reconnect to running training pods
793
+ if job.status not in ("completed", "failed") and job.pod_id:
794
+ try:
795
+ pod = await asyncio.to_thread(runpod.get_pod, job.pod_id)
796
+ if pod and pod.get("desiredStatus") == "RUNNING":
797
+ job.status = "training"
798
+ job.error = None
799
+ job._log("Reconnecting to running training pod after restart...")
800
+ asyncio.create_task(self._reconnect_training(job))
801
+ logger.info("Reconnecting to training pod %s for job %s", job.pod_id, job.id)
802
+ else:
803
+ job.status = "failed"
804
+ job.error = "Pod terminated during server restart"
805
+ except Exception as e:
806
+ logger.warning("Could not check pod %s: %s", job.pod_id, e)
807
+ job.status = "failed"
808
+ job.error = "Interrupted by server restart"
809
+ elif job.status not in ("completed", "failed"):
810
  job.status = "failed"
811
  job.error = "Interrupted by server restart"
 
812
  except Exception as e:
813
  logger.warning("Failed to load training jobs from DB: %s", e)
814
 
 
818
  self._loaded_from_db = True
819
  await self._load_jobs_from_db()
820
 
821
+ async def _reconnect_training(self, job: CloudTrainingJob):
822
+ """Reconnect to a training pod after server restart and resume log monitoring."""
823
+ import paramiko
824
+ ssh = None
825
+ try:
826
+ # Get SSH info from RunPod
827
+ pod = await asyncio.to_thread(runpod.get_pod, job.pod_id)
828
+ if not pod:
829
+ raise RuntimeError("Pod not found")
830
+
831
+ runtime = pod.get("runtime") or {}
832
+ ports = runtime.get("ports") or []
833
+ ssh_host = ssh_port = None
834
+ for p in ports:
835
+ if p.get("privatePort") == 22:
836
+ ssh_host = p.get("ip")
837
+ ssh_port = p.get("publicPort")
838
+
839
+ if not ssh_host or not ssh_port:
840
+ raise RuntimeError("SSH port not available")
841
+
842
+ # Connect SSH
843
+ ssh = paramiko.SSHClient()
844
+ ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
845
+ await asyncio.to_thread(
846
+ ssh.connect, ssh_host, port=int(ssh_port),
847
+ username="root", password="runpod", timeout=10,
848
+ )
849
+ transport = ssh.get_transport()
850
+ transport.set_keepalive(30)
851
+ job._log(f"Reconnected to pod {job.pod_id}")
852
+
853
+ # Check if training is still running
854
+ log_file = "/tmp/training.log"
855
+ exit_file = "/tmp/training.exit"
856
+ pid_file = "/tmp/training.pid"
857
+
858
+ exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
859
+ if exit_check:
860
+ # Training already finished while we were disconnected
861
+ exit_code = int(exit_check)
862
+ log_tail = await self._ssh_exec(ssh, f"tail -50 {log_file} 2>/dev/null")
863
+ for line in log_tail.split("\n"):
864
+ line = line.strip()
865
+ if line:
866
+ job._log(line)
867
+ self._parse_progress(job, line)
868
+
869
+ if exit_code == 0:
870
+ job._log("Training completed while disconnected!")
871
+ # Copy LoRA to volume
872
+ name = job.name
873
+ await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
874
+ remote_files = (await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null")).strip()
875
+ if remote_files:
876
+ remote_output = remote_files.split("\n")[-1].strip()
877
+ await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
878
+ job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
879
+ job.output_path = f"/runpod-volume/loras/{name}.safetensors"
880
+
881
+ job.status = "completed"
882
+ job.progress = 1.0
883
+ job.completed_at = time.time()
884
+ else:
885
+ raise RuntimeError(f"Training failed with exit code {exit_code}")
886
+ else:
887
+ # Training still running — resume log monitoring
888
+ pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
889
+ job._log(f"Training still running (PID: {pid}), resuming monitoring...")
890
+
891
+ last_offset = 0
892
+ while True:
893
+ exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
894
+ if exit_check:
895
+ exit_code = int(exit_check)
896
+ remaining = await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30)
897
+ if remaining:
898
+ for line in remaining.split("\n"):
899
+ line = line.strip()
900
+ if line:
901
+ job._log(line)
902
+ self._parse_progress(job, line)
903
+
904
+ if exit_code == 0:
905
+ # Copy LoRA to volume
906
+ name = job.name
907
+ await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
908
+ remote_files = (await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null")).strip()
909
+ if remote_files:
910
+ remote_output = remote_files.split("\n")[-1].strip()
911
+ await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
912
+ job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
913
+ job.output_path = f"/runpod-volume/loras/{name}.safetensors"
914
+
915
+ job.status = "completed"
916
+ job.progress = 1.0
917
+ job.completed_at = time.time()
918
+ break
919
+ else:
920
+ raise RuntimeError(f"Training failed with exit code {exit_code}")
921
+
922
+ try:
923
+ new_output = await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30)
924
+ if new_output:
925
+ last_offset += len(new_output.encode("utf-8"))
926
+ for line in new_output.replace("\r", "\n").split("\n"):
927
+ line = line.strip()
928
+ if line:
929
+ job._log(line)
930
+ self._parse_progress(job, line)
931
+ self._schedule_db_save(job)
932
+ except Exception:
933
+ pass
934
+ await asyncio.sleep(5)
935
+
936
+ job._log("Training complete!")
937
+
938
+ except Exception as e:
939
+ job.status = "failed"
940
+ job.error = str(e)
941
+ job._log(f"Reconnect failed: {e}")
942
+ logger.error("Training reconnect failed for %s: %s", job.id, e)
943
+
944
+ finally:
945
+ if ssh:
946
+ try:
947
+ ssh.close()
948
+ except Exception:
949
+ pass
950
+ # Terminate pod
951
+ if job.pod_id:
952
+ try:
953
+ await asyncio.to_thread(runpod.terminate_pod, job.pod_id)
954
+ job._log("Pod terminated")
955
+ except Exception:
956
+ pass
957
+ self._schedule_db_save(job)
958
+
959
  def _build_training_command(
960
  self,
961
  *,