Spaces:

dippoo
/

content-engine

Running

dippoo Claude Opus 4.6 committed on Feb 18

Commit

3aa914c

1 Parent(s): b341f22

Fix event loop blocking: make all SSH/SFTP calls async

All _ssh_exec calls now use asyncio.to_thread to avoid blocking
the web server during training setup. SFTP put/get also wrapped.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show

src/content_engine/services/runpod_trainer.py +41 -35

src/content_engine/services/runpod_trainer.py CHANGED Viewed

@@ -304,7 +304,7 @@ class RunPodTrainer:
             # If using network volume, symlink to /workspace so all paths work
             if NETWORK_VOLUME_ID:
-                self._ssh_exec(ssh, "mkdir -p /runpod-volume/models && rm -rf /workspace/models 2>/dev/null; ln -sf /runpod-volume/models /workspace/models")
                 job._log("Network volume symlinked to /workspace")
             # Enable keepalive to prevent SSH timeout during uploads
@@ -324,7 +324,7 @@ class RunPodTrainer:
             tmp_dir = Path(tempfile.mkdtemp(prefix="lora_upload_"))
             folder_name = f"10_{trigger_word or 'character'}"
-            self._ssh_exec(ssh, f"mkdir -p /workspace/dataset/{folder_name}")
             for i, img_path in enumerate(image_paths):
                 p = Path(img_path)
                 if p.exists():
@@ -342,7 +342,7 @@ class RunPodTrainer:
                     remote_path = f"/workspace/dataset/{folder_name}/{remote_name}"
                     for attempt in range(3):
                         try:
-                            sftp.put(str(upload_path), remote_path)
                             break
                         except (EOFError, OSError):
                             if attempt == 2:
@@ -355,12 +355,14 @@ class RunPodTrainer:
                     local_caption = p.with_suffix(".txt")
                     if local_caption.exists():
                         remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
-                        sftp.put(str(local_caption), remote_caption)
                     else:
                         # Fallback: create caption from trigger word
                         remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
-                        with sftp.open(remote_caption, "w") as f:
-                            f.write(trigger_word or "")
                     job._log(f"Uploaded {i+1}/{len(image_paths)}: {p.name}")
             # Cleanup temp compressed images
@@ -381,18 +383,18 @@ class RunPodTrainer:
                 install_cmds = []
                 # Check if already present in workspace
-                tuner_exist = self._ssh_exec(ssh, f"test -f {tuner_dir}/pyproject.toml && echo EXISTS || echo MISSING").strip()
                 if tuner_exist == "EXISTS":
                     job._log("musubi-tuner found in workspace")
                 else:
                     # Check volume cache
-                    vol_exist = self._ssh_exec(ssh, "test -f /runpod-volume/musubi-tuner/pyproject.toml && echo EXISTS || echo MISSING").strip()
                     if vol_exist == "EXISTS":
                         job._log("Restoring musubi-tuner from volume cache...")
-                        self._ssh_exec(ssh, f"rm -rf {tuner_dir} 2>/dev/null; cp -r /runpod-volume/musubi-tuner {tuner_dir}")
                     else:
                         job._log("Cloning musubi-tuner from GitHub...")
-                        self._ssh_exec(ssh, f"rm -rf {tuner_dir} /runpod-volume/musubi-tuner 2>/dev/null; true")
                         install_cmds.append(f"cd /workspace && git clone --depth 1 https://github.com/kohya-ss/musubi-tuner.git")
                         # Save to volume for future pods
                         if NETWORK_VOLUME_ID:
@@ -406,7 +408,7 @@ class RunPodTrainer:
                 ])
             else:
                 # SD 1.5 / SDXL / FLUX.1 use sd-scripts
-                scripts_exist = self._ssh_exec(ssh, "test -f /workspace/sd-scripts/setup.py && echo EXISTS || echo MISSING").strip()
                 if scripts_exist == "EXISTS":
                     job._log("Kohya sd-scripts already cached on volume, updating...")
                     install_cmds = [
@@ -423,7 +425,7 @@ class RunPodTrainer:
                     "pip install accelerate lion-pytorch prodigyopt safetensors bitsandbytes xformers 2>&1 | tail -1",
                 ])
             for cmd in install_cmds:
-                out = self._ssh_exec(ssh, cmd, timeout=600)
                 job._log(out[:200] if out else "done")
             # Download base model from HuggingFace (skip if already on network volume)
@@ -432,7 +434,7 @@ class RunPodTrainer:
             model_name = model_cfg.get("name", job.base_model)
             job.progress = 0.1
-            self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)
             if model_type == "flux2":
                 # FLUX.2 models are stored in a directory structure on the volume
@@ -441,9 +443,9 @@ class RunPodTrainer:
                 vae_path = f"{flux2_dir}/ae.safetensors"  # Original BFL format (not diffusers)
                 te_path = f"{flux2_dir}/text_encoder/model-00001-of-00010.safetensors"
-                dit_exists = self._ssh_exec(ssh, f"test -f {dit_path} && echo EXISTS || echo MISSING").strip()
-                vae_exists = self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING").strip()
-                te_exists = self._ssh_exec(ssh, f"test -f {te_path} && echo EXISTS || echo MISSING").strip()
                 if dit_exists != "EXISTS" or te_exists != "EXISTS":
                     missing = []
@@ -456,14 +458,14 @@ class RunPodTrainer:
                 # Download ae.safetensors (original format VAE) if not present
                 if vae_exists != "EXISTS":
                     job._log("Downloading FLUX.2 VAE (ae.safetensors, 336MB)...")
-                    self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)
-                    self._ssh_exec(ssh, f"""python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('black-forest-labs/FLUX.2-dev', 'ae.safetensors', local_dir='{flux2_dir}')
 print('Downloaded ae.safetensors')
 " 2>&1 | tail -5""", timeout=600)
                     # Verify download
-                    vae_check = self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING").strip()
                     if vae_check != "EXISTS":
                         raise RuntimeError("Failed to download ae.safetensors")
                     job._log("VAE downloaded")
@@ -472,12 +474,12 @@ print('Downloaded ae.safetensors')
             else:
                 # SD 1.5 / SDXL / FLUX.1 — download single model file
-                model_exists = self._ssh_exec(ssh, f"test -f /workspace/models/{hf_filename} && echo EXISTS || echo MISSING").strip()
                 if model_exists == "EXISTS":
                     job._log(f"Base model already cached on volume: {model_name}")
                 else:
                     job._log(f"Downloading base model: {model_name}...")
-                    self._ssh_exec(ssh, f"""
                         python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('{hf_repo}', '{hf_filename}', local_dir='/workspace/models')
@@ -486,13 +488,13 @@ hf_hub_download('{hf_repo}', '{hf_filename}', local_dir='/workspace/models')
                 # For FLUX.1, download additional required models (CLIP, T5, VAE)
                 if model_type == "flux":
-                    flux_files_check = self._ssh_exec(ssh, "test -f /workspace/models/clip_l.safetensors && test -f /workspace/models/t5xxl_fp16.safetensors && test -f /workspace/models/ae.safetensors && echo EXISTS || echo MISSING").strip()
                     if flux_files_check == "EXISTS":
                         job._log("FLUX.1 auxiliary models already cached on volume")
                     else:
                         job._log("Downloading FLUX.1 auxiliary models (CLIP, T5, VAE)...")
                         job.progress = 0.12
-                        self._ssh_exec(ssh, """
                             python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('comfyanonymous/flux_text_encoders', 'clip_l.safetensors', local_dir='/workspace/models')
@@ -523,7 +525,7 @@ batch_size = 1
 num_repeats = 10
 resolution = [{resolution}, {resolution}]
 """
-                self._ssh_exec(ssh, f"cat > /workspace/dataset.toml << 'TOMLEOF'\n{toml_content}TOMLEOF")
                 job._log("Created dataset.toml config")
                 # musubi-tuner requires pre-caching latents and text encoder outputs
@@ -542,13 +544,13 @@ resolution = [{resolution}, {resolution}]
                     f" --vae_dtype bfloat16"
                     f" 2>&1 | tee /tmp/cache_latents.log; echo EXIT_CODE=${{PIPESTATUS[0]}}"
                 )
-                out = self._ssh_exec(ssh, cache_latents_cmd, timeout=600)
                 # Get last lines which have the real error
                 last_lines = out.split('\n')[-30:]
                 job._log('\n'.join(last_lines))
                 if "EXIT_CODE=0" not in out:
                     # Fetch the full error log
-                    err_log = self._ssh_exec(ssh, "grep -i 'error\\|exception\\|traceback\\|failed' /tmp/cache_latents.log | tail -10")
                     job._log(f"Cache error details: {err_log}")
                     raise RuntimeError(f"Latent caching failed")
@@ -564,7 +566,7 @@ resolution = [{resolution}, {resolution}]
                     f" --batch_size 1"
                     f" 2>&1; echo EXIT_CODE=$?"
                 )
-                out = self._ssh_exec(ssh, cache_te_cmd, timeout=600)
                 job._log(out[-500:] if out else "done")
                 if "EXIT_CODE=0" not in out:
                     raise RuntimeError(f"Text encoder caching failed: {out[-200:]}")
@@ -598,7 +600,7 @@ resolution = [{resolution}, {resolution}]
             last_flush = time.time()
             while not channel.exit_status_ready() or channel.recv_ready():
                 if channel.recv_ready():
-                    chunk = channel.recv(4096).decode("utf-8", errors="replace")
                     buffer += chunk
                     # Process complete lines (handle both \n and \r for tqdm progress)
                     while "\n" in buffer or "\r" in buffer:
@@ -640,18 +642,18 @@ resolution = [{resolution}, {resolution}]
             # First, copy to network volume for persistence
             job._log("Saving LoRA to network volume...")
-            self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
             remote_output = f"/workspace/output/{name}.safetensors"
             # Find the output file
-            check = self._ssh_exec(ssh, f"test -f {remote_output} && echo EXISTS || echo MISSING").strip()
             if check == "MISSING":
-                remote_files = self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null").strip()
                 if remote_files:
                     remote_output = remote_files.split("\n")[-1].strip()
                 else:
                     raise RuntimeError("No .safetensors output found")
-            self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
             job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
             # Download locally (skip on HF Spaces — limited storage)
@@ -662,7 +664,7 @@ resolution = [{resolution}, {resolution}]
                 job._log("Downloading LoRA to local machine...")
                 LORA_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
                 local_path = LORA_OUTPUT_DIR / f"{name}.safetensors"
-                sftp.get(remote_output, str(local_path))
                 job.output_path = str(local_path)
                 job._log(f"LoRA saved locally to {local_path}")
@@ -982,8 +984,8 @@ resolution = [{resolution}, {resolution}]
         raise RuntimeError(f"Pod did not become ready within {timeout}s")
-    def _ssh_exec(self, ssh, cmd: str, timeout: int = 120) -> str:
-        """Execute a command over SSH and return stdout."""
         _, stdout, stderr = ssh.exec_command(cmd, timeout=timeout)
         out = stdout.read().decode("utf-8", errors="replace")
         err = stderr.read().decode("utf-8", errors="replace")
@@ -992,6 +994,10 @@ resolution = [{resolution}, {resolution}]
             logger.warning("SSH cmd failed (code %d): %s\nstderr: %s", exit_code, cmd[:100], err[:500])
         return out.strip()
     def _parse_progress(self, job: CloudTrainingJob, line: str):
         """Parse Kohya training output for progress info."""
         lower = line.lower()

             # If using network volume, symlink to /workspace so all paths work
             if NETWORK_VOLUME_ID:
+                await self._ssh_exec(ssh, "mkdir -p /runpod-volume/models && rm -rf /workspace/models 2>/dev/null; ln -sf /runpod-volume/models /workspace/models")
                 job._log("Network volume symlinked to /workspace")
             # Enable keepalive to prevent SSH timeout during uploads
             tmp_dir = Path(tempfile.mkdtemp(prefix="lora_upload_"))
             folder_name = f"10_{trigger_word or 'character'}"
+            await self._ssh_exec(ssh, f"mkdir -p /workspace/dataset/{folder_name}")
             for i, img_path in enumerate(image_paths):
                 p = Path(img_path)
                 if p.exists():
                     remote_path = f"/workspace/dataset/{folder_name}/{remote_name}"
                     for attempt in range(3):
                         try:
+                            await asyncio.to_thread(sftp.put, str(upload_path), remote_path)
                             break
                         except (EOFError, OSError):
                             if attempt == 2:
                     local_caption = p.with_suffix(".txt")
                     if local_caption.exists():
                         remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
+                        await asyncio.to_thread(sftp.put, str(local_caption), remote_caption)
                     else:
                         # Fallback: create caption from trigger word
                         remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
+                        def _write_caption():
+                            with sftp.open(remote_caption, "w") as f:
+                                f.write(trigger_word or "")
+                        await asyncio.to_thread(_write_caption)
                     job._log(f"Uploaded {i+1}/{len(image_paths)}: {p.name}")
             # Cleanup temp compressed images
                 install_cmds = []
                 # Check if already present in workspace
+                tuner_exist = await self._ssh_exec(ssh, f"test -f {tuner_dir}/pyproject.toml && echo EXISTS || echo MISSING").strip()
                 if tuner_exist == "EXISTS":
                     job._log("musubi-tuner found in workspace")
                 else:
                     # Check volume cache
+                    vol_exist = await self._ssh_exec(ssh, "test -f /runpod-volume/musubi-tuner/pyproject.toml && echo EXISTS || echo MISSING").strip()
                     if vol_exist == "EXISTS":
                         job._log("Restoring musubi-tuner from volume cache...")
+                        await self._ssh_exec(ssh, f"rm -rf {tuner_dir} 2>/dev/null; cp -r /runpod-volume/musubi-tuner {tuner_dir}")
                     else:
                         job._log("Cloning musubi-tuner from GitHub...")
+                        await self._ssh_exec(ssh, f"rm -rf {tuner_dir} /runpod-volume/musubi-tuner 2>/dev/null; true")
                         install_cmds.append(f"cd /workspace && git clone --depth 1 https://github.com/kohya-ss/musubi-tuner.git")
                         # Save to volume for future pods
                         if NETWORK_VOLUME_ID:
                 ])
             else:
                 # SD 1.5 / SDXL / FLUX.1 use sd-scripts
+                scripts_exist = await self._ssh_exec(ssh, "test -f /workspace/sd-scripts/setup.py && echo EXISTS || echo MISSING").strip()
                 if scripts_exist == "EXISTS":
                     job._log("Kohya sd-scripts already cached on volume, updating...")
                     install_cmds = [
                     "pip install accelerate lion-pytorch prodigyopt safetensors bitsandbytes xformers 2>&1 | tail -1",
                 ])
             for cmd in install_cmds:
+                out = await self._ssh_exec(ssh, cmd, timeout=600)
                 job._log(out[:200] if out else "done")
             # Download base model from HuggingFace (skip if already on network volume)
             model_name = model_cfg.get("name", job.base_model)
             job.progress = 0.1
+            await self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)
             if model_type == "flux2":
                 # FLUX.2 models are stored in a directory structure on the volume
                 vae_path = f"{flux2_dir}/ae.safetensors"  # Original BFL format (not diffusers)
                 te_path = f"{flux2_dir}/text_encoder/model-00001-of-00010.safetensors"
+                dit_exists = await self._ssh_exec(ssh, f"test -f {dit_path} && echo EXISTS || echo MISSING").strip()
+                vae_exists = await self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING").strip()
+                te_exists = await self._ssh_exec(ssh, f"test -f {te_path} && echo EXISTS || echo MISSING").strip()
                 if dit_exists != "EXISTS" or te_exists != "EXISTS":
                     missing = []
                 # Download ae.safetensors (original format VAE) if not present
                 if vae_exists != "EXISTS":
                     job._log("Downloading FLUX.2 VAE (ae.safetensors, 336MB)...")
+                    await self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)
+                    await self._ssh_exec(ssh, f"""python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('black-forest-labs/FLUX.2-dev', 'ae.safetensors', local_dir='{flux2_dir}')
 print('Downloaded ae.safetensors')
 " 2>&1 | tail -5""", timeout=600)
                     # Verify download
+                    vae_check = await self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING").strip()
                     if vae_check != "EXISTS":
                         raise RuntimeError("Failed to download ae.safetensors")
                     job._log("VAE downloaded")
             else:
                 # SD 1.5 / SDXL / FLUX.1 — download single model file
+                model_exists = await self._ssh_exec(ssh, f"test -f /workspace/models/{hf_filename} && echo EXISTS || echo MISSING").strip()
                 if model_exists == "EXISTS":
                     job._log(f"Base model already cached on volume: {model_name}")
                 else:
                     job._log(f"Downloading base model: {model_name}...")
+                    await self._ssh_exec(ssh, f"""
                         python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('{hf_repo}', '{hf_filename}', local_dir='/workspace/models')
                 # For FLUX.1, download additional required models (CLIP, T5, VAE)
                 if model_type == "flux":
+                    flux_files_check = await self._ssh_exec(ssh, "test -f /workspace/models/clip_l.safetensors && test -f /workspace/models/t5xxl_fp16.safetensors && test -f /workspace/models/ae.safetensors && echo EXISTS || echo MISSING").strip()
                     if flux_files_check == "EXISTS":
                         job._log("FLUX.1 auxiliary models already cached on volume")
                     else:
                         job._log("Downloading FLUX.1 auxiliary models (CLIP, T5, VAE)...")
                         job.progress = 0.12
+                        await self._ssh_exec(ssh, """
                             python -c "
 from huggingface_hub import hf_hub_download
 hf_hub_download('comfyanonymous/flux_text_encoders', 'clip_l.safetensors', local_dir='/workspace/models')
 num_repeats = 10
 resolution = [{resolution}, {resolution}]
 """
+                await self._ssh_exec(ssh, f"cat > /workspace/dataset.toml << 'TOMLEOF'\n{toml_content}TOMLEOF")
                 job._log("Created dataset.toml config")
                 # musubi-tuner requires pre-caching latents and text encoder outputs
                     f" --vae_dtype bfloat16"
                     f" 2>&1 | tee /tmp/cache_latents.log; echo EXIT_CODE=${{PIPESTATUS[0]}}"
                 )
+                out = await self._ssh_exec(ssh, cache_latents_cmd, timeout=600)
                 # Get last lines which have the real error
                 last_lines = out.split('\n')[-30:]
                 job._log('\n'.join(last_lines))
                 if "EXIT_CODE=0" not in out:
                     # Fetch the full error log
+                    err_log = await self._ssh_exec(ssh, "grep -i 'error\\|exception\\|traceback\\|failed' /tmp/cache_latents.log | tail -10")
                     job._log(f"Cache error details: {err_log}")
                     raise RuntimeError(f"Latent caching failed")
                     f" --batch_size 1"
                     f" 2>&1; echo EXIT_CODE=$?"
                 )
+                out = await self._ssh_exec(ssh, cache_te_cmd, timeout=600)
                 job._log(out[-500:] if out else "done")
                 if "EXIT_CODE=0" not in out:
                     raise RuntimeError(f"Text encoder caching failed: {out[-200:]}")
             last_flush = time.time()
             while not channel.exit_status_ready() or channel.recv_ready():
                 if channel.recv_ready():
+                    chunk = (await asyncio.to_thread(channel.recv, 4096)).decode("utf-8", errors="replace")
                     buffer += chunk
                     # Process complete lines (handle both \n and \r for tqdm progress)
                     while "\n" in buffer or "\r" in buffer:
             # First, copy to network volume for persistence
             job._log("Saving LoRA to network volume...")
+            await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
             remote_output = f"/workspace/output/{name}.safetensors"
             # Find the output file
+            check = await self._ssh_exec(ssh, f"test -f {remote_output} && echo EXISTS || echo MISSING").strip()
             if check == "MISSING":
+                remote_files = await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null").strip()
                 if remote_files:
                     remote_output = remote_files.split("\n")[-1].strip()
                 else:
                     raise RuntimeError("No .safetensors output found")
+            await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
             job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
             # Download locally (skip on HF Spaces — limited storage)
                 job._log("Downloading LoRA to local machine...")
                 LORA_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
                 local_path = LORA_OUTPUT_DIR / f"{name}.safetensors"
+                await asyncio.to_thread(sftp.get, remote_output, str(local_path))
                 job.output_path = str(local_path)
                 job._log(f"LoRA saved locally to {local_path}")
         raise RuntimeError(f"Pod did not become ready within {timeout}s")
+    def _ssh_exec_sync(self, ssh, cmd: str, timeout: int = 120) -> str:
+        """Execute a command over SSH and return stdout (blocking)."""
         _, stdout, stderr = ssh.exec_command(cmd, timeout=timeout)
         out = stdout.read().decode("utf-8", errors="replace")
         err = stderr.read().decode("utf-8", errors="replace")
             logger.warning("SSH cmd failed (code %d): %s\nstderr: %s", exit_code, cmd[:100], err[:500])
         return out.strip()
+    async def _ssh_exec(self, ssh, cmd: str, timeout: int = 120) -> str:
+        """Execute a command over SSH without blocking the event loop.
+
+        Runs the blocking ``_ssh_exec_sync`` in a worker thread via
+        ``asyncio.to_thread`` and awaits its result.
+
+        Args:
+            ssh: SSH client object forwarded unchanged to ``_ssh_exec_sync``.
+            cmd: Shell command line to run on the remote host.
+            timeout: Forwarded to ``_ssh_exec_sync`` (default 120).
+
+        Returns:
+            Whatever ``_ssh_exec_sync`` returns: the command's stdout, which
+            that helper has already passed through ``.strip()``.
+
+        Note:
+            This is a coroutine function, so the call itself yields a
+            coroutine object, not a ``str``. Attribute access binds tighter
+            than ``await``: ``await self._ssh_exec(...).strip()`` attempts
+            ``.strip()`` on the coroutine. Callers that post-process the
+            result must write ``(await self._ssh_exec(...)).strip()``, or
+            drop the redundant ``.strip()`` since the output is already
+            stripped.
+        """
+        return await asyncio.to_thread(self._ssh_exec_sync, ssh, cmd, timeout)
     def _parse_progress(self, job: CloudTrainingJob, line: str):
         """Parse Kohya training output for progress info."""
         lower = line.lower()