"""RunPod cloud LoRA training — offload training to RunPod GPU pods.

Creates a temporary GPU pod, uploads training images, runs Kohya sd-scripts,
downloads the finished LoRA, then terminates the pod. No local GPU needed.

Supports multiple base models (FLUX, SD 1.5, SDXL) via model registry.

Usage:
    Set RUNPOD_API_KEY in .env
    Select "Cloud (RunPod)" in the training UI
"""

from __future__ import annotations

import asyncio
import logging
import os
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import runpod
import yaml

logger = logging.getLogger(__name__)

from content_engine.config import settings, IS_HF_SPACES
from content_engine.models.database import catalog_session_factory, TrainingJob as TrainingJobDB

LORA_OUTPUT_DIR = settings.paths.lora_dir
if IS_HF_SPACES:
    CONFIG_DIR = Path("/app/config")
else:
    CONFIG_DIR = Path("D:/AI automation/content_engine/config")

# RunPod GPU options (id -> display name, approx cost/hr)
GPU_OPTIONS = {
    # 16-32GB - SD 1.5, SDXL, FLUX.1 only (NOT enough for FLUX.2)
    "NVIDIA GeForce RTX 3090": "RTX 3090 24GB (~$0.22/hr)",
    "NVIDIA GeForce RTX 4090": "RTX 4090 24GB (~$0.44/hr)",
    "NVIDIA GeForce RTX 5090": "RTX 5090 32GB (~$0.69/hr)",
    "NVIDIA RTX A4000": "RTX A4000 16GB (~$0.20/hr)",
    "NVIDIA RTX A5000": "RTX A5000 24GB (~$0.28/hr)",
    # 48GB+ - Required for FLUX.2 Dev (Mistral text encoder needs ~48GB)
    "NVIDIA RTX A6000": "RTX A6000 48GB (~$0.76/hr)",
    "NVIDIA A40": "A40 48GB (~$0.64/hr)",
    "NVIDIA L40": "L40 48GB (~$0.89/hr)",
    "NVIDIA L40S": "L40S 48GB (~$1.09/hr)",
    "NVIDIA A100 80GB PCIe": "A100 80GB (~$1.89/hr)",
    "NVIDIA A100-SXM4-80GB": "A100 SXM 80GB (~$1.64/hr)",
    "NVIDIA H100 80GB HBM3": "H100 80GB (~$3.89/hr)",
}

DEFAULT_GPU = "NVIDIA GeForce RTX 4090"

# Network volume for persistent model storage (avoids re-downloading models each run)
# Set RUNPOD_VOLUME_ID in .env to use a persistent volume
# Set RUNPOD_VOLUME_DC to the datacenter ID where the volume lives (e.g. "EU-RO-1")
NETWORK_VOLUME_ID = os.environ.get("RUNPOD_VOLUME_ID", "")
NETWORK_VOLUME_DC = os.environ.get("RUNPOD_VOLUME_DC", "")
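# e.g. in .env (illustrative values):
#   RUNPOD_VOLUME_ID=abc123xyz
#   RUNPOD_VOLUME_DC=EU-RO-1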

# Docker image with PyTorch + CUDA pre-installed
DOCKER_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"


def load_model_registry() -> dict[str, dict]:
    """Load training model configurations from config/models.yaml."""
    models_file = CONFIG_DIR / "models.yaml"
    if not models_file.exists():
        logger.warning("Model registry not found: %s", models_file)
        return {}
    with open(models_file) as f:
        config = yaml.safe_load(f)
    return config.get("training_models", {})


@dataclass
class CloudTrainingJob:
    """Tracks state of a RunPod cloud training job."""

    id: str
    name: str
    status: str = "pending"  # pending, creating_pod, uploading, installing, training, downloading, completed, failed
    progress: float = 0.0
    current_epoch: int = 0
    total_epochs: int = 0
    current_step: int = 0
    total_steps: int = 0
    loss: float | None = None
    started_at: float | None = None
    completed_at: float | None = None
    output_path: str | None = None
    error: str | None = None
    log_lines: list[str] = field(default_factory=list)
    pod_id: str | None = None
    gpu_type: str = DEFAULT_GPU
    cost_estimate: str | None = None
    base_model: str = "sd15_realistic"
    model_type: str = "sd15"
    _db_callback: Any = None  # called on state changes to persist to DB

    def _log(self, msg: str):
        self.log_lines.append(msg)
        if len(self.log_lines) > 200:
            self.log_lines = self.log_lines[-200:]
        logger.info("[%s] %s", self.id, msg)
        if self._db_callback:
            self._db_callback(self)


class RunPodTrainer:
    """Manages LoRA training on RunPod cloud GPUs."""

    def __init__(self, api_key: str):
        self._api_key = api_key
        runpod.api_key = api_key
        self._jobs: dict[str, CloudTrainingJob] = {}
        self._model_registry = load_model_registry()
        self._loaded_from_db = False

    @property
    def available(self) -> bool:
        """Check if RunPod is configured."""
        # Re-set module-level key in case uvicorn reload cleared it
        if self._api_key:
            runpod.api_key = self._api_key
        return bool(self._api_key)

    def list_gpu_options(self) -> dict[str, str]:
        return GPU_OPTIONS

    def list_training_models(self) -> dict[str, dict]:
        """List available base models for training with their parameters."""
        return {
            key: {
                "name": cfg.get("name", key),
                "description": cfg.get("description", ""),
                "model_type": cfg.get("model_type", "sd15"),
                "resolution": cfg.get("resolution", 512),
                "learning_rate": cfg.get("learning_rate", 1e-4),
                "network_rank": cfg.get("network_rank", 32),
                "network_alpha": cfg.get("network_alpha", 16),
                "optimizer": cfg.get("optimizer", "AdamW8bit"),
                "lr_scheduler": cfg.get("lr_scheduler", "cosine"),
                "vram_required_gb": cfg.get("vram_required_gb", 8),
                "recommended_images": cfg.get("recommended_images", "15-30 photos"),
            }
            for key, cfg in self._model_registry.items()
        }

    def get_model_config(self, model_key: str) -> dict | None:
        """Get configuration for a specific training model."""
        return self._model_registry.get(model_key)

    async def start_training(
        self,
        *,
        name: str,
        image_paths: list[str],
        trigger_word: str = "",
        base_model: str = "sd15_realistic",
        resolution: int | None = None,
        num_epochs: int = 10,
        max_train_steps: int | None = None,
        learning_rate: float | None = None,
        network_rank: int | None = None,
        network_alpha: int | None = None,
        optimizer: str | None = None,
        save_every_n_epochs: int = 2,
        save_every_n_steps: int = 500,
        gpu_type: str = DEFAULT_GPU,
    ) -> str:
        """Start a cloud training job. Returns job ID.

        Parameters use model registry defaults if not specified.
        """
        job_id = str(uuid.uuid4())[:8]

        # Get model config (fall back to sd15_realistic if not found)
        model_cfg = self._model_registry.get(base_model, self._model_registry.get("sd15_realistic", {}))
        model_type = model_cfg.get("model_type", "sd15")

        # Use provided values or model defaults
        final_resolution = resolution or model_cfg.get("resolution", 512)
        final_lr = learning_rate or model_cfg.get("learning_rate", 1e-4)
        final_rank = network_rank or model_cfg.get("network_rank", 32)
        final_alpha = network_alpha or model_cfg.get("network_alpha", 16)
        final_optimizer = optimizer or model_cfg.get("optimizer", "AdamW8bit")
        final_steps = max_train_steps or model_cfg.get("max_train_steps")

        job = CloudTrainingJob(
            id=job_id,
            name=name,
            status="pending",
            total_epochs=num_epochs,
            total_steps=final_steps or 0,
            gpu_type=gpu_type,
            started_at=time.time(),
            base_model=base_model,
            model_type=model_type,
        )
        self._jobs[job_id] = job
        job._db_callback = self._schedule_db_save
        asyncio.ensure_future(self._save_to_db(job))

        # Launch the full pipeline as a background task
        asyncio.create_task(self._run_cloud_training(
            job=job,
            image_paths=image_paths,
            trigger_word=trigger_word,
            model_cfg=model_cfg,
            resolution=final_resolution,
            num_epochs=num_epochs,
            max_train_steps=final_steps,
            learning_rate=final_lr,
            network_rank=final_rank,
            network_alpha=final_alpha,
            optimizer=final_optimizer,
            save_every_n_epochs=save_every_n_epochs,
            save_every_n_steps=save_every_n_steps,
        ))

        return job_id

    async def _run_cloud_training(
        self,
        job: CloudTrainingJob,
        image_paths: list[str],
        trigger_word: str,
        model_cfg: dict,
        resolution: int,
        num_epochs: int,
        max_train_steps: int | None,
        learning_rate: float,
        network_rank: int,
        network_alpha: int,
        optimizer: str,
        save_every_n_epochs: int,
        save_every_n_steps: int = 500,
    ):
        """Full cloud training pipeline: create pod -> upload -> train -> download -> cleanup."""
        ssh = None
        sftp = None
        model_type = model_cfg.get("model_type", "sd15")
        name = job.name

        try:
            # Step 1: Create pod
            job.status = "creating_pod"
            job._log(f"Creating RunPod with {job.gpu_type}...")

            # Use network volume if configured (persists models across runs)
            pod_kwargs = {
                "container_disk_in_gb": 30,
                "ports": "22/tcp",
                "docker_args": "bash -c 'apt-get update && apt-get install -y openssh-server && mkdir -p /run/sshd && echo root:runpod | chpasswd && /usr/sbin/sshd -o PermitRootLogin=yes && sleep infinity'",
            }
            if NETWORK_VOLUME_ID:
                pod_kwargs["network_volume_id"] = NETWORK_VOLUME_ID
                if NETWORK_VOLUME_DC:
                    pod_kwargs["data_center_id"] = NETWORK_VOLUME_DC
                job._log(f"Using persistent network volume: {NETWORK_VOLUME_ID} (DC: {NETWORK_VOLUME_DC or 'auto'})")
            else:
                pod_kwargs["volume_in_gb"] = 75

            pod = await asyncio.to_thread(
                runpod.create_pod,
                f"lora-train-{job.id}",
                DOCKER_IMAGE,
                job.gpu_type,
                **pod_kwargs,
            )

            job.pod_id = pod["id"]
            job._log(f"Pod created: {job.pod_id}")

            # Wait for pod to be ready and get SSH info
            job._log("Waiting for pod to start...")
            ssh_host, ssh_port = await self._wait_for_pod_ready(job)
            job._log(f"Pod ready at {ssh_host}:{ssh_port}")

            # Step 2: Connect via SSH
            import paramiko
            ssh = paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

            for attempt in range(30):
                try:
                    await asyncio.to_thread(
                        ssh.connect,
                        ssh_host, port=ssh_port,
                        username="root", password="runpod",
                        timeout=10,
                    )
                    break
                except Exception:
                    if attempt == 29:
                        raise RuntimeError("Could not SSH into pod after 30 attempts")
                    await asyncio.sleep(5)

            job._log("SSH connected")

            # If using network volume, symlink to /workspace so all paths work
            if NETWORK_VOLUME_ID:
                await self._ssh_exec(ssh, "mkdir -p /runpod-volume/models && rm -rf /workspace/models 2>/dev/null; ln -sf /runpod-volume/models /workspace/models")
                job._log("Network volume symlinked to /workspace")

            # Enable keepalive to prevent SSH timeout during uploads
            transport = ssh.get_transport()
            transport.set_keepalive(30)

            sftp = ssh.open_sftp()
            sftp.get_channel().settimeout(300)  # 5 min timeout per file

            # Step 3: Upload training images (compress first to speed up transfer)
            job.status = "uploading"
            resolution = model_cfg.get("resolution", 1024)
            job._log(f"Compressing and uploading {len(image_paths)} training images...")

            import tempfile
            from PIL import Image
            tmp_dir = Path(tempfile.mkdtemp(prefix="lora_upload_"))

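            # Kohya dataset convention: a folder named "<repeats>_<name>"; the "10_" prefix repeats each image 10x per epoch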
            folder_name = f"10_{trigger_word or 'character'}"
            await self._ssh_exec(ssh, f"mkdir -p /workspace/dataset/{folder_name}")
            for i, img_path in enumerate(image_paths):
                p = Path(img_path)
                if p.exists():
                    # Resize and convert to JPEG to reduce upload size
                    try:
                        img = Image.open(p)
                        img.thumbnail((resolution * 2, resolution * 2), Image.LANCZOS)
                        compressed = tmp_dir / f"{p.stem}.jpg"
                        img.save(compressed, "JPEG", quality=95)
                        upload_path = compressed
                    except Exception:
                        upload_path = p  # fallback to original

                    remote_name = f"{p.stem}.jpg" if upload_path.suffix == ".jpg" else p.name
                    remote_path = f"/workspace/dataset/{folder_name}/{remote_name}"
                    for attempt in range(3):
                        try:
                            await asyncio.to_thread(sftp.put, str(upload_path), remote_path)
                            break
                        except (EOFError, OSError):
                            if attempt == 2:
                                raise
                            job._log(f"Upload retry {attempt+1} for {p.name}")
                            sftp.close()
                            sftp = ssh.open_sftp()
                            sftp.get_channel().settimeout(300)
                    # Upload matching caption .txt file if it exists locally
                    local_caption = p.with_suffix(".txt")
                    if local_caption.exists():
                        remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
                        await asyncio.to_thread(sftp.put, str(local_caption), remote_caption)
                    else:
                        # Fallback: create caption from trigger word
                        remote_caption = f"/workspace/dataset/{folder_name}/{p.stem}.txt"
                        def _write_caption():
                            with sftp.open(remote_caption, "w") as f:
                                f.write(trigger_word or "")
                        await asyncio.to_thread(_write_caption)
                    job._log(f"Uploaded {i+1}/{len(image_paths)}: {p.name}")

            # Cleanup temp compressed images
            import shutil
            shutil.rmtree(tmp_dir, ignore_errors=True)

            job._log("Images uploaded")

            # Step 4: Install training framework on the pod (skip if cached on volume)
            job.status = "installing"
            job.progress = 0.05

            training_framework = model_cfg.get("training_framework", "sd-scripts")

            if training_framework == "musubi-tuner":
                # FLUX.2 uses musubi-tuner (Kohya's newer framework)
                tuner_dir = "/workspace/musubi-tuner"
                install_cmds = []

                # Check if already present in workspace
                tuner_exist = (await self._ssh_exec(ssh, f"test -f {tuner_dir}/pyproject.toml && echo EXISTS || echo MISSING")).strip()
                if tuner_exist == "EXISTS":
                    job._log("musubi-tuner found in workspace")
                else:
                    # Check volume cache
                    vol_exist = (await self._ssh_exec(ssh, "test -f /runpod-volume/musubi-tuner/pyproject.toml && echo EXISTS || echo MISSING")).strip()
                    if vol_exist == "EXISTS":
                        job._log("Restoring musubi-tuner from volume cache...")
                        await self._ssh_exec(ssh, f"rm -rf {tuner_dir} 2>/dev/null; cp -r /runpod-volume/musubi-tuner {tuner_dir}")
                    else:
                        job._log("Cloning musubi-tuner from GitHub...")
                        await self._ssh_exec(ssh, f"rm -rf {tuner_dir} /runpod-volume/musubi-tuner 2>/dev/null; true")
                        install_cmds.append(f"cd /workspace && git clone --depth 1 https://github.com/kohya-ss/musubi-tuner.git")
                        # Save to volume for future pods
                        if NETWORK_VOLUME_ID:
                            install_cmds.append(f"cp -r {tuner_dir} /runpod-volume/musubi-tuner")

                # Always install pip deps (they are pod-local, lost on every new pod)
                job._log("Installing pip dependencies (accelerate, torch, etc.)...")
                install_cmds.extend([
                    f"cd {tuner_dir} && pip install -e . 2>&1 | tail -5",
                    "pip install accelerate lion-pytorch prodigyopt safetensors bitsandbytes 2>&1 | tail -5",
                ])
            else:
                # SD 1.5 / SDXL / FLUX.1 use sd-scripts
                scripts_exist = (await self._ssh_exec(ssh, "test -f /workspace/sd-scripts/setup.py && echo EXISTS || echo MISSING")).strip()
                if scripts_exist == "EXISTS":
                    job._log("Kohya sd-scripts already cached on volume, updating...")
                    install_cmds = [
                        "cd /workspace/sd-scripts && git pull 2>&1 | tail -1",
                    ]
                else:
                    job._log("Installing Kohya sd-scripts (this takes a few minutes)...")
                    install_cmds = [
                        "cd /workspace && git clone --depth 1 https://github.com/kohya-ss/sd-scripts.git",
                    ]
                # Always install pip deps (pod-local, lost on new pods)
                install_cmds.extend([
                    "cd /workspace/sd-scripts && pip install -r requirements.txt 2>&1 | tail -1",
                    "pip install accelerate lion-pytorch prodigyopt safetensors bitsandbytes xformers 2>&1 | tail -1",
                ])
            for cmd in install_cmds:
                out = await self._ssh_exec(ssh, cmd, timeout=600)
                job._log(out[:200] if out else "done")

            # Download base model from HuggingFace (skip if already on network volume)
            hf_repo = model_cfg.get("hf_repo", "SG161222/Realistic_Vision_V5.1_noVAE")
            hf_filename = model_cfg.get("hf_filename", "Realistic_Vision_V5.1_fp16-no-ema.safetensors")
            model_name = model_cfg.get("name", job.base_model)

            job.progress = 0.1
            await self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)

            if model_type == "flux2":
                # FLUX.2 models are stored in a directory structure on the volume
                flux2_dir = "/workspace/models/FLUX.2-dev"
                dit_path = f"{flux2_dir}/flux2-dev.safetensors"
                vae_path = f"{flux2_dir}/ae.safetensors"  # Original BFL format (not diffusers)
                te_path = f"{flux2_dir}/text_encoder/model-00001-of-00010.safetensors"

                dit_exists = (await self._ssh_exec(ssh, f"test -f {dit_path} && echo EXISTS || echo MISSING")).strip()
                vae_exists = (await self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING")).strip()
                te_exists = (await self._ssh_exec(ssh, f"test -f {te_path} && echo EXISTS || echo MISSING")).strip()

                if dit_exists != "EXISTS" or te_exists != "EXISTS":
                    missing = []
                    if dit_exists != "EXISTS":
                        missing.append("DiT")
                    if te_exists != "EXISTS":
                        missing.append("text encoder")
                    raise RuntimeError(f"FLUX.2 Dev missing on volume: {', '.join(missing)}. Please download models to the network volume first.")

                # Download ae.safetensors (original format VAE) if not present
                if vae_exists != "EXISTS":
                    job._log("Downloading FLUX.2 VAE (ae.safetensors, 336MB)...")
                    await self._ssh_exec(ssh, """pip install huggingface_hub 2>&1 | tail -1""", timeout=120)
                    await self._ssh_exec(ssh, f"""python -c "
from huggingface_hub import hf_hub_download
hf_hub_download('black-forest-labs/FLUX.2-dev', 'ae.safetensors', local_dir='{flux2_dir}')
print('Downloaded ae.safetensors')
" 2>&1 | tail -5""", timeout=600)
                    # Verify download
                    vae_check = (await self._ssh_exec(ssh, f"test -f {vae_path} && echo EXISTS || echo MISSING")).strip()
                    if vae_check != "EXISTS":
                        raise RuntimeError("Failed to download ae.safetensors")
                    job._log("VAE downloaded")

                job._log("FLUX.2 Dev models ready")

            elif model_type == "wan22":
                # WAN 2.2 T2V — 4 model files stored in /workspace/models/WAN2.2/
                wan_dir = "/workspace/models/WAN2.2"
                await self._ssh_exec(ssh, f"mkdir -p {wan_dir}")

                wan_files = {
                    "DiT low-noise": {
                        "path": f"{wan_dir}/wan2.2_t2v_low_noise_14B_fp16.safetensors",
                        "repo": "Comfy-Org/Wan_2.2_ComfyUI_Repackaged",
                        "filename": "split_files/diffusion_models/wan2.2_t2v_low_noise_14B_fp16.safetensors",
                    },
                    "DiT high-noise": {
                        "path": f"{wan_dir}/wan2.2_t2v_high_noise_14B_fp16.safetensors",
                        "repo": "Comfy-Org/Wan_2.2_ComfyUI_Repackaged",
                        "filename": "split_files/diffusion_models/wan2.2_t2v_high_noise_14B_fp16.safetensors",
                    },
                    "VAE": {
                        "path": f"{wan_dir}/Wan2.1_VAE.pth",
                        "repo": "Wan-AI/Wan2.1-I2V-14B-720P",
                        "filename": "Wan2.1_VAE.pth",
                    },
                    "T5 text encoder": {
                        "path": f"{wan_dir}/models_t5_umt5-xxl-enc-bf16.pth",
                        "repo": "Wan-AI/Wan2.1-I2V-14B-720P",
                        "filename": "models_t5_umt5-xxl-enc-bf16.pth",
                    },
                }

                for label, info in wan_files.items():
                    exists = (await self._ssh_exec(ssh, f"test -f {info['path']} && echo EXISTS || echo MISSING")).strip()
                    if exists == "EXISTS":
                        job._log(f"WAN 2.2 {label} already cached")
                    else:
                        job._log(f"Downloading WAN 2.2 {label}...")
                        await self._ssh_exec(ssh, f"""python -c "
from huggingface_hub import hf_hub_download
hf_hub_download('{info['repo']}', '{info['filename']}', local_dir='{wan_dir}')
# hf_hub_download puts files in subdirs matching the filename path — move to root
import os, shutil
downloaded = os.path.join('{wan_dir}', '{info['filename']}')
target = '{info['path']}'
if os.path.exists(downloaded) and downloaded != target:
    shutil.move(downloaded, target)
print('Downloaded {label}')
" 2>&1 | tail -5""", timeout=1800)
                        # Verify
                        check = (await self._ssh_exec(ssh, f"test -f {info['path']} && echo EXISTS || echo MISSING")).strip()
                        if check != "EXISTS":
                            raise RuntimeError(f"Failed to download WAN 2.2 {label}")

                job._log("WAN 2.2 models ready")

            else:
                # SD 1.5 / SDXL / FLUX.1 — download single model file
                model_exists = (await self._ssh_exec(ssh, f"test -f /workspace/models/{hf_filename} && echo EXISTS || echo MISSING")).strip()
                if model_exists == "EXISTS":
                    job._log(f"Base model already cached on volume: {model_name}")
                else:
                    job._log(f"Downloading base model: {model_name}...")
                    await self._ssh_exec(ssh, f"""
                        python -c "
from huggingface_hub import hf_hub_download
hf_hub_download('{hf_repo}', '{hf_filename}', local_dir='/workspace/models')
" 2>&1 | tail -5
                    """, timeout=1200)

                # For FLUX.1, download additional required models (CLIP, T5, VAE)
                if model_type == "flux":
                    flux_files_check = (await self._ssh_exec(ssh, "test -f /workspace/models/clip_l.safetensors && test -f /workspace/models/t5xxl_fp16.safetensors && test -f /workspace/models/ae.safetensors && echo EXISTS || echo MISSING")).strip()
                    if flux_files_check == "EXISTS":
                        job._log("FLUX.1 auxiliary models already cached on volume")
                    else:
                        job._log("Downloading FLUX.1 auxiliary models (CLIP, T5, VAE)...")
                        job.progress = 0.12
                        await self._ssh_exec(ssh, """
                            python -c "
from huggingface_hub import hf_hub_download
hf_hub_download('comfyanonymous/flux_text_encoders', 'clip_l.safetensors', local_dir='/workspace/models')
hf_hub_download('comfyanonymous/flux_text_encoders', 't5xxl_fp16.safetensors', local_dir='/workspace/models')
hf_hub_download('black-forest-labs/FLUX.1-dev', 'ae.safetensors', local_dir='/workspace/models')
" 2>&1 | tail -5
                        """, timeout=1200)

            job._log("Base model ready")
            job.progress = 0.15

            # Step 5: Run training
            job.status = "training"
            job._log(f"Starting {model_type.upper()} LoRA training...")

            if model_type == "flux2":
                model_path = f"/workspace/models/FLUX.2-dev/flux2-dev.safetensors"
            elif model_type == "wan22":
                model_path = "/workspace/models/WAN2.2/wan2.2_t2v_low_noise_14B_fp16.safetensors"
            else:
                model_path = f"/workspace/models/{hf_filename}"

            # For musubi-tuner, create TOML dataset config
            if training_framework == "musubi-tuner":
                folder_name = f"10_{trigger_word or 'character'}"
                toml_content = f"""[[datasets]]
image_directory = "/workspace/dataset/{folder_name}"
caption_extension = ".txt"
batch_size = 1
num_repeats = 10
resolution = [{resolution}, {resolution}]
"""
                await self._ssh_exec(ssh, f"cat > /workspace/dataset.toml << 'TOMLEOF'\n{toml_content}TOMLEOF")
                job._log("Created dataset.toml config")

                # musubi-tuner requires pre-caching latents and text encoder outputs
                if model_type == "wan22":
                    wan_dir = "/workspace/models/WAN2.2"
                    vae_path = f"{wan_dir}/Wan2.1_VAE.pth"
                    te_path = f"{wan_dir}/models_t5_umt5-xxl-enc-bf16.pth"

                    job._log("Caching WAN 2.2 latents (VAE encoding)...")
                    job.progress = 0.15
                    self._schedule_db_save(job)
                    cache_latents_cmd = (
                        f"cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
                        f" python src/musubi_tuner/wan_cache_latents.py"
                        f" --dataset_config /workspace/dataset.toml"
                        f" --vae {vae_path}"
                        f" --vae_dtype bfloat16"
                        f" 2>&1 | tee /tmp/cache_latents.log; echo EXIT_CODE=${{PIPESTATUS[0]}}"
                    )
                    out = await self._ssh_exec(ssh, cache_latents_cmd, timeout=600)
                    last_lines = out.split('\n')[-30:]
                    job._log('\n'.join(last_lines))
                    if "EXIT_CODE=0" not in out:
                        err_log = await self._ssh_exec(ssh, "grep -i 'error\\|exception\\|traceback\\|failed' /tmp/cache_latents.log | tail -10")
                        job._log(f"Cache error details: {err_log}")
                        raise RuntimeError(f"WAN latent caching failed")

                    job._log("Caching WAN 2.2 text encoder outputs (T5)...")
                    job.progress = 0.25
                    self._schedule_db_save(job)
                    cache_te_cmd = (
                        f"cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
                        f" python src/musubi_tuner/wan_cache_text_encoder_outputs.py"
                        f" --dataset_config /workspace/dataset.toml"
                        f" --t5 {te_path}"
                        f" --batch_size 16"
                        f" 2>&1; echo EXIT_CODE=$?"
                    )
                    out = await self._ssh_exec(ssh, cache_te_cmd, timeout=600)
                    job._log(out[-500:] if out else "done")
                    if "EXIT_CODE=0" not in out:
                        raise RuntimeError(f"WAN text encoder caching failed: {out[-200:]}")

                else:
                    # FLUX.2 caching
                    flux2_dir = "/workspace/models/FLUX.2-dev"
                    vae_path = f"{flux2_dir}/ae.safetensors"
                    te_path = f"{flux2_dir}/text_encoder/model-00001-of-00010.safetensors"

                    job._log("Caching latents (VAE encoding)...")
                    job.progress = 0.15
                    self._schedule_db_save(job)
                    cache_latents_cmd = (
                        f"cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python src/musubi_tuner/flux_2_cache_latents.py"
                        f" --dataset_config /workspace/dataset.toml"
                        f" --vae {vae_path}"
                        f" --model_version dev"
                        f" --vae_dtype bfloat16"
                        f" 2>&1 | tee /tmp/cache_latents.log; echo EXIT_CODE=${{PIPESTATUS[0]}}"
                    )
                    out = await self._ssh_exec(ssh, cache_latents_cmd, timeout=600)
                    last_lines = out.split('\n')[-30:]
                    job._log('\n'.join(last_lines))
                    if "EXIT_CODE=0" not in out:
                        err_log = await self._ssh_exec(ssh, "grep -i 'error\\|exception\\|traceback\\|failed' /tmp/cache_latents.log | tail -10")
                        job._log(f"Cache error details: {err_log}")
                        raise RuntimeError(f"Latent caching failed")

                    job._log("Caching text encoder outputs (bf16)...")
                    job.progress = 0.25
                    self._schedule_db_save(job)
                    cache_te_cmd = (
                        f"cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
                        f" python src/musubi_tuner/flux_2_cache_text_encoder_outputs.py"
                        f" --dataset_config /workspace/dataset.toml"
                        f" --text_encoder {te_path}"
                        f" --model_version dev"
                        f" --batch_size 1"
                        f" 2>&1; echo EXIT_CODE=$?"
                    )
                    out = await self._ssh_exec(ssh, cache_te_cmd, timeout=600)
                    job._log(out[-500:] if out else "done")
                    if "EXIT_CODE=0" not in out:
                        raise RuntimeError(f"Text encoder caching failed: {out[-200:]}")

            # Build training command based on model type
            train_cmd = self._build_training_command(
                model_type=model_type,
                model_path=model_path,
                name=name,
                resolution=resolution,
                num_epochs=num_epochs,
                max_train_steps=max_train_steps,
                learning_rate=learning_rate,
                network_rank=network_rank,
                network_alpha=network_alpha,
                optimizer=optimizer,
                save_every_n_epochs=save_every_n_epochs,
                save_every_n_steps=save_every_n_steps,
                model_cfg=model_cfg,
                gpu_type=job.gpu_type,
            )

            # Execute training in a detached process (survives SSH disconnect)
            job._log("Starting training (detached — survives disconnects)...")
            log_file = "/tmp/training.log"
            pid_file = "/tmp/training.pid"
            exit_file = "/tmp/training.exit"
            await self._ssh_exec(ssh, f"rm -f {log_file} {exit_file} {pid_file}")

            # Write training command to a script file (avoids quoting issues with nohup)
            script_file = "/tmp/train.sh"
            await self._ssh_exec(ssh, f"cat > {script_file} << 'TRAINEOF'\n#!/bin/bash\n{train_cmd} > {log_file} 2>&1\necho $? > {exit_file}\nTRAINEOF")
            await self._ssh_exec(ssh, f"chmod +x {script_file}")

            # Verify script was written
            script_check = (await self._ssh_exec(ssh, f"wc -l < {script_file}")).strip()
            job._log(f"Training script written ({script_check} lines)")

            # Launch fully detached: close all FDs so SSH channel doesn't hang
            await self._ssh_exec(
                ssh,
                f"setsid {script_file} </dev/null >/dev/null 2>&1 &\necho $! > {pid_file}",
                timeout=15,
            )
            await asyncio.sleep(3)
            pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
            if not pid:
                # Fallback: find the process by script name
                pid = (await self._ssh_exec(ssh, "pgrep -f train.sh 2>/dev/null | head -1")).strip()
            job._log(f"Training PID: {pid}")

            # Verify process is actually running
            if pid:
                running = (await self._ssh_exec(ssh, f"kill -0 {pid} 2>&1 && echo RUNNING || echo DEAD")).strip()
                job._log(f"Process status: {running}")
                if "DEAD" in running:
                    # Check if it already wrote an exit code (fast failure)
                    early_exit = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
                    early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
                    raise RuntimeError(f"Training process died immediately. Exit: {early_exit}\nLog: {early_log}")
            else:
                early_log = (await self._ssh_exec(ssh, f"cat {log_file} 2>/dev/null | tail -20")).strip()
                raise RuntimeError(f"Failed to start training process.\nLog: {early_log}")

            # Monitor the log file (reconnect-safe)
            last_offset = 0
            while True:
                # Check if training finished
                exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
                if exit_check:
                    exit_code = int(exit_check)
                    # Read remaining log
                    remaining = (await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30))
                    if remaining:
                        for line in remaining.split("\n"):
                            line = line.strip()
                            if line:
                                job._log(line)
                                self._parse_progress(job, line)
                    if exit_code != 0:
                        raise RuntimeError(f"Training failed with exit code {exit_code}")
                    break

                # Read new log output
                try:
                    new_output = (await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30))
                    if new_output:
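                        # tail -c +N is 1-indexed in bytes, so advance the offset by the UTF-8 byte length, not the character count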
                        last_offset += len(new_output.encode("utf-8"))
                        for line in new_output.replace("\r", "\n").split("\n"):
                            line = line.strip()
                            if not line:
                                continue
                            job._log(line)
                            self._parse_progress(job, line)
                        self._schedule_db_save(job)
                except Exception:
                    job._log("Log read failed, retrying...")

                await asyncio.sleep(5)

            job._log("Training completed on RunPod!")
            job.progress = 0.9

            # Step 6: Save LoRA to network volume and download locally
            job.status = "downloading"

            # First, copy to network volume for persistence
            job._log("Saving LoRA to network volume...")
            await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
            remote_output = f"/workspace/output/{name}.safetensors"
            # Find the output file
            check = (await self._ssh_exec(ssh, f"test -f {remote_output} && echo EXISTS || echo MISSING")).strip()
            if check == "MISSING":
                remote_files = (await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null")).strip()
                if remote_files:
                    remote_output = remote_files.split("\n")[-1].strip()
                else:
                    raise RuntimeError("No .safetensors output found")

            await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
            job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")

            # Also save intermediate checkpoints (step 500, 1000, 1500, etc.)
            checkpoint_files = (await self._ssh_exec(ssh, f"ls /workspace/output/{name}-step*.safetensors 2>/dev/null")).strip()
            if checkpoint_files:
                for ckpt in checkpoint_files.split("\n"):
                    ckpt = ckpt.strip()
                    if ckpt:
                        ckpt_name = ckpt.split("/")[-1]
                        await self._ssh_exec(ssh, f"cp {ckpt} /runpod-volume/loras/{ckpt_name}")
                        job._log(f"Checkpoint saved: /runpod-volume/loras/{ckpt_name}")

            # Download locally (skip on HF Spaces — limited storage)
            if IS_HF_SPACES:
                job.output_path = f"/runpod-volume/loras/{name}.safetensors"
                job._log("LoRA saved on RunPod volume (ready for generation)")
            else:
                job._log("Downloading LoRA to local machine...")
                LORA_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
                local_path = LORA_OUTPUT_DIR / f"{name}.safetensors"
                await asyncio.to_thread(sftp.get, remote_output, str(local_path))
                job.output_path = str(local_path)
                job._log(f"LoRA saved locally to {local_path}")

            # Done!
            job.status = "completed"
            job.progress = 1.0
            job.completed_at = time.time()
            elapsed = (job.completed_at - job.started_at) / 60
            job._log(f"Cloud training complete in {elapsed:.1f} minutes")

        except Exception as e:
            job.status = "failed"
            job.error = str(e)
            job._log(f"ERROR: {e}")
            logger.error("Cloud training failed: %s", e, exc_info=True)

        finally:
            # Cleanup: close SSH and terminate pod
            if sftp:
                try:
                    sftp.close()
                except Exception:
                    pass
            if ssh:
                try:
                    ssh.close()
                except Exception:
                    pass

            # Clean up local training images (saves HF Spaces storage)
            if image_paths:
                import shutil
                first_image_dir = Path(image_paths[0]).parent
                if first_image_dir.exists() and "training_uploads" in str(first_image_dir):
                    shutil.rmtree(first_image_dir, ignore_errors=True)

            if job.pod_id:
                try:
                    job._log("Terminating RunPod...")
                    await asyncio.to_thread(runpod.terminate_pod, job.pod_id)
                    job._log("Pod terminated")
                except Exception as e:
                    job._log(f"Warning: Failed to terminate pod {job.pod_id}: {e}")

    def _schedule_db_save(self, job: CloudTrainingJob):
        """Schedule a DB save (non-blocking)."""
        try:
            asyncio.get_running_loop().create_task(self._save_to_db(job))
        except RuntimeError:
            pass  # no event loop

    async def _save_to_db(self, job: CloudTrainingJob):
        """Persist job state to database."""
        try:
            from sqlalchemy import text
            async with catalog_session_factory() as session:
                # Use raw INSERT OR REPLACE for SQLite upsert
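                # COALESCE preserves the row's original created_at on re-save (INSERT OR REPLACE would otherwise reset it)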
                await session.execute(
                    text("""INSERT OR REPLACE INTO training_jobs
                        (id, name, status, progress, current_epoch, total_epochs,
                         current_step, total_steps, loss, started_at, completed_at,
                         output_path, error, log_text, pod_id, gpu_type, backend,
                         base_model, model_type, created_at)
                        VALUES (:id, :name, :status, :progress, :current_epoch, :total_epochs,
                                :current_step, :total_steps, :loss, :started_at, :completed_at,
                                :output_path, :error, :log_text, :pod_id, :gpu_type, :backend,
                                :base_model, :model_type, COALESCE((SELECT created_at FROM training_jobs WHERE id = :id), CURRENT_TIMESTAMP))
                    """),
                    {
                        "id": job.id, "name": job.name, "status": job.status,
                        "progress": job.progress, "current_epoch": job.current_epoch,
                        "total_epochs": job.total_epochs, "current_step": job.current_step,
                        "total_steps": job.total_steps, "loss": job.loss,
                        "started_at": job.started_at, "completed_at": job.completed_at,
                        "output_path": job.output_path, "error": job.error,
                        "log_text": "\n".join(job.log_lines[-200:]),
                        "pod_id": job.pod_id, "gpu_type": job.gpu_type,
                        "backend": "runpod", "base_model": job.base_model,
                        "model_type": job.model_type,
                    }
                )
                await session.commit()
        except Exception as e:
            logger.warning("Failed to save training job to DB: %s", e)

    async def _load_jobs_from_db(self):
        """Load previously saved jobs from database on startup."""
        try:
            from sqlalchemy import select
            async with catalog_session_factory() as session:
                result = await session.execute(
                    select(TrainingJobDB).order_by(TrainingJobDB.created_at.desc()).limit(20)
                )
                db_jobs = result.scalars().all()
                for db_job in db_jobs:
                    if db_job.id not in self._jobs:
                        job = CloudTrainingJob(
                            id=db_job.id,
                            name=db_job.name,
                            status=db_job.status,
                            progress=db_job.progress or 0.0,
                            current_epoch=db_job.current_epoch or 0,
                            total_epochs=db_job.total_epochs or 0,
                            current_step=db_job.current_step or 0,
                            total_steps=db_job.total_steps or 0,
                            loss=db_job.loss,
                            started_at=db_job.started_at,
                            completed_at=db_job.completed_at,
                            output_path=db_job.output_path,
                            error=db_job.error,
                            log_lines=(db_job.log_text or "").split("\n") if db_job.log_text else [],
                            pod_id=db_job.pod_id,
                            gpu_type=db_job.gpu_type or DEFAULT_GPU,
                            base_model=db_job.base_model or "sd15_realistic",
                            model_type=db_job.model_type or "sd15",
                        )
                        self._jobs[db_job.id] = job
                        # Try to reconnect to running training pods
                        if job.status not in ("completed", "failed") and job.pod_id:
                            try:
                                pod = await asyncio.to_thread(runpod.get_pod, job.pod_id)
                                if pod and pod.get("desiredStatus") == "RUNNING":
                                    job.status = "training"
                                    job.error = None
                                    job._log("Reconnecting to running training pod after restart...")
                                    asyncio.create_task(self._reconnect_training(job))
                                    logger.info("Reconnecting to training pod %s for job %s", job.pod_id, job.id)
                                else:
                                    job.status = "failed"
                                    job.error = "Pod terminated during server restart"
                            except Exception as e:
                                logger.warning("Could not check pod %s: %s", job.pod_id, e)
                                job.status = "failed"
                                job.error = "Interrupted by server restart"
                        elif job.status not in ("completed", "failed"):
                            job.status = "failed"
                            job.error = "Interrupted by server restart"
        except Exception as e:
            logger.warning("Failed to load training jobs from DB: %s", e)

    async def ensure_loaded(self):
        """Load jobs from DB on first access."""
        if not self._loaded_from_db:
            self._loaded_from_db = True
            await self._load_jobs_from_db()

    async def _reconnect_training(self, job: CloudTrainingJob):
        """Reconnect to a training pod after server restart and resume log monitoring."""
        import paramiko
        ssh = None
        try:
            # Get SSH info from RunPod
            pod = await asyncio.to_thread(runpod.get_pod, job.pod_id)
            if not pod:
                raise RuntimeError("Pod not found")

            runtime = pod.get("runtime") or {}
            ports = runtime.get("ports") or []
            ssh_host = ssh_port = None
            for p in ports:
                if p.get("privatePort") == 22:
                    ssh_host = p.get("ip")
                    ssh_port = p.get("publicPort")

            if not ssh_host or not ssh_port:
                raise RuntimeError("SSH port not available")

            # Connect SSH
            ssh = paramiko.SSHClient()
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            await asyncio.to_thread(
                ssh.connect, ssh_host, port=int(ssh_port),
                username="root", password="runpod", timeout=10,
            )
            transport = ssh.get_transport()
            transport.set_keepalive(30)
            job._log(f"Reconnected to pod {job.pod_id}")

            # Check if training is still running
            log_file = "/tmp/training.log"
            exit_file = "/tmp/training.exit"
            pid_file = "/tmp/training.pid"
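            # Sentinel files assumed to be written by the detached launch command
            # from the original run: stdout appends to training.log, the PID goes
            # to training.pid, and the exit code lands in training.exit on finish.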

            exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
            if exit_check:
                # Training already finished while we were disconnected
                exit_code = int(exit_check)
                log_tail = await self._ssh_exec(ssh, f"tail -50 {log_file} 2>/dev/null")
                for line in log_tail.replace("\r", "\n").split("\n"):
                    line = line.strip()
                    if line:
                        job._log(line)
                        self._parse_progress(job, line)

                if exit_code == 0:
                    job._log("Training completed while disconnected!")
                    # Copy LoRA to volume
                    name = job.name
                    await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
                    remote_files = (await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null")).strip()
                    if remote_files:
                        remote_output = remote_files.split("\n")[-1].strip()
                        await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
                        job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
                        job.output_path = f"/runpod-volume/loras/{name}.safetensors"

                    job.status = "completed"
                    job.progress = 1.0
                    job.completed_at = time.time()
                else:
                    raise RuntimeError(f"Training failed with exit code {exit_code}")
            else:
                # Training still running — resume log monitoring
                pid = (await self._ssh_exec(ssh, f"cat {pid_file} 2>/dev/null")).strip()
                job._log(f"Training still running (PID: {pid}), resuming monitoring...")

                last_offset = 0
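                # Incremental tailing: `tail -c +N` is 1-based and counts bytes,
                # so last_offset is tracked in encoded UTF-8 bytes rather than
                # str characters, staying aligned with what tail already returned.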
                while True:
                    exit_check = (await self._ssh_exec(ssh, f"cat {exit_file} 2>/dev/null")).strip()
                    if exit_check:
                        exit_code = int(exit_check)
                        remaining = await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30)
                        if remaining:
                            for line in remaining.replace("\r", "\n").split("\n"):
                                line = line.strip()
                                if line:
                                    job._log(line)
                                    self._parse_progress(job, line)

                        if exit_code == 0:
                            # Copy LoRA to volume
                            name = job.name
                            await self._ssh_exec(ssh, "mkdir -p /runpod-volume/loras")
                            remote_files = (await self._ssh_exec(ssh, "ls /workspace/output/*.safetensors 2>/dev/null")).strip()
                            if remote_files:
                                remote_output = remote_files.split("\n")[-1].strip()
                                await self._ssh_exec(ssh, f"cp {remote_output} /runpod-volume/loras/{name}.safetensors")
                                job._log(f"LoRA saved to volume: /runpod-volume/loras/{name}.safetensors")
                                job.output_path = f"/runpod-volume/loras/{name}.safetensors"

                            job.status = "completed"
                            job.progress = 1.0
                            job.completed_at = time.time()
                            break
                        else:
                            raise RuntimeError(f"Training failed with exit code {exit_code}")

                    try:
                        new_output = await self._ssh_exec(ssh, f"tail -c +{last_offset + 1} {log_file} 2>/dev/null", timeout=30)
                        if new_output:
                            last_offset += len(new_output.encode("utf-8"))
                            for line in new_output.replace("\r", "\n").split("\n"):
                                line = line.strip()
                                if line:
                                    job._log(line)
                                    self._parse_progress(job, line)
                            self._schedule_db_save(job)
                    except Exception:
                        pass
                    await asyncio.sleep(5)

            job._log("Training complete!")

        except Exception as e:
            job.status = "failed"
            job.error = str(e)
            job._log(f"Reconnect failed: {e}")
            logger.error("Training reconnect failed for %s: %s", job.id, e)

        finally:
            if ssh:
                try:
                    ssh.close()
                except Exception:
                    pass
            # Terminate pod
            if job.pod_id:
                try:
                    await asyncio.to_thread(runpod.terminate_pod, job.pod_id)
                    job._log("Pod terminated")
                except Exception:
                    pass
            self._schedule_db_save(job)

    def _build_training_command(
        self,
        *,
        model_type: str,
        model_path: str,
        name: str,
        resolution: int,
        num_epochs: int,
        max_train_steps: int | None,
        learning_rate: float,
        network_rank: int,
        network_alpha: int,
        optimizer: str,
        save_every_n_epochs: int,
        model_cfg: dict,
        save_every_n_steps: int = 500,
        gpu_type: str = "",
    ) -> str:
        """Build the training command based on model type."""

        # Common parameters. The backslash-newlines inside this literal are
        # consumed by Python as line-continuation escapes, so the block collapses
        # to one line; the backslash right after the opening quotes escapes the
        # leading newline, which would otherwise split the shell command in two.
        base_args = f"""\
            --train_data_dir="/workspace/dataset" \
            --output_dir="/workspace/output" \
            --output_name="{name}" \
            --resolution={resolution} \
            --train_batch_size=1 \
            --learning_rate={learning_rate} \
            --network_module=networks.lora \
            --network_dim={network_rank} \
            --network_alpha={network_alpha} \
            --optimizer_type={optimizer} \
            --save_every_n_epochs={save_every_n_epochs} \
            --mixed_precision=fp16 \
            --seed=42 \
            --caption_extension=.txt \
            --cache_latents \
            --enable_bucket \
            --save_model_as=safetensors"""

        # Steps vs epochs
        if max_train_steps:
            base_args += f" \\\n            --max_train_steps={max_train_steps}"
        else:
            base_args += f" \\\n            --max_train_epochs={num_epochs}"

        # LR scheduler
        lr_scheduler = model_cfg.get("lr_scheduler", "cosine_with_restarts")
        base_args += f" \\\n            --lr_scheduler={lr_scheduler}"

        if model_type == "flux2":
            # FLUX.2 training via musubi-tuner
            flux2_dir = "/workspace/models/FLUX.2-dev"
            dit_path = f"{flux2_dir}/flux2-dev.safetensors"
            vae_path = f"{flux2_dir}/ae.safetensors"
            te_path = f"{flux2_dir}/text_encoder/model-00001-of-00010.safetensors"

            network_mod = model_cfg.get("network_module", "networks.lora_flux_2")
            ts_sampling = model_cfg.get("timestep_sampling", "flux2_shift")
            lr_scheduler = model_cfg.get("lr_scheduler", "cosine")

            # Build as list of args to avoid shell escaping issues
            args = [
                "cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
                "accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16",
                "src/musubi_tuner/flux_2_train_network.py",
                "--model_version dev",
                f"--dit {dit_path}",
                f"--vae {vae_path}",
                f"--text_encoder {te_path}",
                "--dataset_config /workspace/dataset.toml",
                "--sdpa --mixed_precision bf16",
                f"--timestep_sampling {ts_sampling} --weighting_scheme none",
                f"--network_module {network_mod}",
                f"--network_dim={network_rank}",
                f"--network_alpha={network_alpha}",
                "--gradient_checkpointing",
            ]

            # Only use fp8_base on GPUs with native fp8 support (RTX 4090/5090,
            # L40S, H100). A100 and A6000 lack fp8 tensor ops, and have enough
            # VRAM to train without it.
            if gpu_type and ("4090" in gpu_type or "5090" in gpu_type or "L40S" in gpu_type or "H100" in gpu_type):
                args.append("--fp8_base")

            # Handle Prodigy optimizer (needs special class path and args)
            if optimizer.lower() == "prodigy":
                args.extend([
                    "--optimizer_type=prodigyopt.Prodigy",
                    f"--learning_rate={learning_rate}",
                    '--optimizer_args "weight_decay=0.01" "decouple=True" "use_bias_correction=True" "safeguard_warmup=True" "d_coef=2"',
                ])
            else:
                args.extend([
                    f"--optimizer_type={optimizer}",
                    f"--learning_rate={learning_rate}",
                ])

            args.extend([
                "--seed=42",
                '--output_dir=/workspace/output',
                f'--output_name={name}',
                f"--lr_scheduler={lr_scheduler}",
            ])

            if max_train_steps:
                args.append(f"--max_train_steps={max_train_steps}")
                if save_every_n_steps:
                    args.append(f"--save_every_n_steps={save_every_n_steps}")
                else:
                    args.append(f"--save_every_n_epochs={save_every_n_epochs}")
            else:
                args.append(f"--max_train_epochs={num_epochs}")
                args.append(f"--save_every_n_epochs={save_every_n_epochs}")

            return " ".join(args) + " 2>&1"

        elif model_type == "wan22":
            # WAN 2.2 T2V LoRA training via musubi-tuner
            wan_dir = "/workspace/models/WAN2.2"
            dit_low = f"{wan_dir}/wan2.2_t2v_low_noise_14B_fp16.safetensors"
            dit_high = f"{wan_dir}/wan2.2_t2v_high_noise_14B_fp16.safetensors"

            network_mod = model_cfg.get("network_module", "networks.lora_wan")
            ts_sampling = model_cfg.get("timestep_sampling", "shift")
            discrete_shift = model_cfg.get("discrete_flow_shift", 5.0)

            args = [
                "cd /workspace/musubi-tuner && PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
                "accelerate launch --num_cpu_threads_per_process 1 --mixed_precision fp16",
                "src/musubi_tuner/wan_train_network.py",
                "--task t2v-A14B",
                f"--dit {dit_low}",
                f"--dit_high_noise {dit_high}",
                "--dataset_config /workspace/dataset.toml",
                "--sdpa --mixed_precision fp16",
                "--gradient_checkpointing",
                f"--timestep_sampling {ts_sampling}",
                f"--discrete_flow_shift {discrete_shift}",
                f"--network_module {network_mod}",
                f"--network_dim={network_rank}",
                f"--network_alpha={network_alpha}",
                f"--optimizer_type={optimizer}",
                f"--learning_rate={learning_rate}",
                "--seed=42",
                "--output_dir=/workspace/output",
                f"--output_name={name}",
            ]

            if max_train_steps:
                args.append(f"--max_train_steps={max_train_steps}")
                if save_every_n_steps:
                    args.append(f"--save_every_n_steps={save_every_n_steps}")
            else:
                args.append(f"--max_train_epochs={num_epochs}")
                args.append(f"--save_every_n_epochs={save_every_n_epochs}")

            return " ".join(args) + " 2>&1"

        elif model_type == "flux":
            # FLUX.1 training via sd-scripts
            script = "flux_train_network.py"
            flux_args = f"""
            --pretrained_model_name_or_path="{model_path}" \
            --clip_l="/workspace/models/clip_l.safetensors" \
            --t5xxl="/workspace/models/t5xxl_fp16.safetensors" \
            --ae="/workspace/models/ae.safetensors" \
            --cache_text_encoder_outputs \
            --cache_text_encoder_outputs_to_disk \
            --fp8_base \
            --split_mode"""

            # Text encoder learning rate
            text_encoder_lr = model_cfg.get("text_encoder_lr", 4e-5)
            flux_args += f" \\\n            --text_encoder_lr={text_encoder_lr}"

            # Min SNR gamma if specified
            min_snr = model_cfg.get("min_snr_gamma")
            if min_snr:
                flux_args += f" \\\n            --min_snr_gamma={min_snr}"

            return f"cd /workspace/sd-scripts && accelerate launch --num_cpu_threads_per_process 1 {script} {flux_args} {base_args} 2>&1"

        elif model_type == "sdxl":
            # SDXL-specific training
            script = "sdxl_train_network.py"
            clip_skip = model_cfg.get("clip_skip", 2)
            return f"""cd /workspace/sd-scripts && accelerate launch --num_cpu_threads_per_process 1 {script} \
            --pretrained_model_name_or_path="{model_path}" \
            --clip_skip={clip_skip} \
            --xformers {base_args} 2>&1"""

        else:
            # SD 1.5 / default training
            script = "train_network.py"
            clip_skip = model_cfg.get("clip_skip", 1)
            return f"""cd /workspace/sd-scripts && accelerate launch --num_cpu_threads_per_process 1 {script} \
            --pretrained_model_name_or_path="{model_path}" \
            --clip_skip={clip_skip} \
            --xformers {base_args} 2>&1"""

    async def _wait_for_pod_ready(self, job: CloudTrainingJob, timeout: int = 600) -> tuple[str, int]:
        """Wait for pod to be running and return (ssh_host, ssh_port)."""
        start = time.time()
        while time.time() - start < timeout:
            try:
                pod = await asyncio.to_thread(runpod.get_pod, job.pod_id)
            except Exception as e:
                job._log(f"  API error: {e}")
                await asyncio.sleep(10)
                continue

            if not pod:
                await asyncio.sleep(5)
                continue

            status = pod.get("desiredStatus", "")
            runtime = pod.get("runtime")

            if status == "RUNNING" and runtime:
                ports = runtime.get("ports") or []
                for port_info in ports:
                    if port_info.get("privatePort") == 22:
                        ip = port_info.get("ip")
                        public_port = port_info.get("publicPort")
                        if ip and public_port:
                            return ip, int(public_port)

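            # Emit a status line roughly every 30 s (the loop polls every 5 s).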
            elapsed = int(time.time() - start)
            if elapsed % 30 < 6:
                job._log(f"  Status: {status} | runtime: {'ports pending' if runtime else 'not ready yet'} ({elapsed}s)")

            await asyncio.sleep(5)

        raise RuntimeError(f"Pod did not become ready within {timeout}s")

    def _ssh_exec_sync(self, ssh, cmd: str, timeout: int = 120) -> str:
        """Execute a command over SSH and return stdout (blocking)."""
        _, stdout, stderr = ssh.exec_command(cmd, timeout=timeout)
        out = stdout.read().decode("utf-8", errors="replace")
        err = stderr.read().decode("utf-8", errors="replace")
        exit_code = stdout.channel.recv_exit_status()
        if exit_code != 0 and "warning" not in err.lower():
            logger.warning("SSH cmd failed (code %d): %s\nstderr: %s", exit_code, cmd[:100], err[:500])
        return out.strip()

    async def _ssh_exec(self, ssh, cmd: str, timeout: int = 120) -> str:
        """Execute a command over SSH without blocking the event loop."""
        return await asyncio.to_thread(self._ssh_exec_sync, ssh, cmd, timeout)

    def _parse_progress(self, job: CloudTrainingJob, line: str):
        """Parse Kohya training output for progress info."""
        lower = line.lower()
        if "epoch" in lower and "/" in line:
            try:
                parts = lower.split("epoch")
                if len(parts) > 1:
                    ep_part = parts[1].strip().split()[0]
                    if "/" in ep_part:
                        current, total = ep_part.split("/")
                        job.current_epoch = int(current)
                        job.total_epochs = int(total)
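                        # Map epoch progress onto the 15%-90% band; the ends are
                        # presumably reserved for setup/upload beforehand and
                        # saving/copying afterwards.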
                        job.progress = 0.15 + 0.75 * (job.current_epoch / max(job.total_epochs, 1))
            except (ValueError, IndexError):
                pass

        if "loss=" in line or "loss:" in line:
            try:
                loss_str = line.split("loss")[1].strip("=: ").split()[0].strip(",]")
                job.loss = float(loss_str)
            except (ValueError, IndexError):
                pass

        if "steps:" in lower or "step " in lower:
            try:
                import re
                step_match = re.search(r"(\d+)/(\d+)", line)
                if step_match:
                    job.current_step = int(step_match.group(1))
                    job.total_steps = int(step_match.group(2))
            except (ValueError, IndexError):
                pass

    def get_job(self, job_id: str) -> CloudTrainingJob | None:
        return self._jobs.get(job_id)

    def list_jobs(self) -> list[CloudTrainingJob]:
        return list(self._jobs.values())

    async def cancel_job(self, job_id: str) -> bool:
        """Cancel a cloud training job and terminate its pod."""
        job = self._jobs.get(job_id)
        if not job:
            return False
        if job.pod_id:
            try:
                await asyncio.to_thread(runpod.terminate_pod, job.pod_id)
            except Exception:
                pass
        job.status = "failed"
        job.error = "Cancelled by user"
        return True

    async def delete_job(self, job_id: str) -> bool:
        """Delete a training job from memory and database."""
        if job_id not in self._jobs:
            return False
        del self._jobs[job_id]
        try:
            async with catalog_session_factory() as session:
                from sqlalchemy import select
                result = await session.execute(
                    select(TrainingJobDB).where(TrainingJobDB.id == job_id)
                )
                db_job = result.scalar_one_or_none()
                if db_job:
                    await session.delete(db_job)
                    await session.commit()
        except Exception as e:
            logger.warning("Failed to delete job from DB: %s", e)
        return True

    async def delete_failed_jobs(self) -> int:
        """Delete all failed/error training jobs."""
        failed_ids = [jid for jid, j in self._jobs.items() if j.status in ("failed", "error")]
        for jid in failed_ids:
            await self.delete_job(jid)
        return len(failed_ids)
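
    # Minimal usage sketch (hypothetical call site; the manager variable and
    # surrounding app wiring are assumptions, not part of this module):
    #
    #   await manager.ensure_loaded()       # restore persisted jobs after a restart
    #   for job in manager.list_jobs():
    #       print(job.id, job.status, job.progress)
    #   await manager.cancel_job(some_job_id)   # terminates the pod, marks failed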