Nekochu committed on
Commit 917e4ed · 1 Parent(s): 04c031f

add GPU/CUDA auto-detect, mixed precision, flash_attn, txt caption parser
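A minimal usage sketch of the new device/dtype helpers (illustrative only; the functions are defined in the diff below, and the printed values depend on the host):

    # Sketch -- assumes train_engine.py is importable from the working directory.
    from train_engine import detect_device, select_dtype

    device = detect_device("auto")  # "cuda:N" (largest-VRAM GPU), "mps", or "cpu"
    dtype = select_dtype(device)    # bf16 on Ampere+, fp16 on older CUDA, fp32 on CPU/MPS
    print(device, dtype)            # e.g. cuda:0 torch.bfloat16

    # preprocess_audio() and train_lora_generator() now default to device="auto"
    # and run this same detection internally.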

Files changed (1)
  1. train_engine.py +299 -44
train_engine.py CHANGED
@@ -1,10 +1,16 @@
 """
-Standalone ACE-Step CPU LoRA Training Engine.
+Standalone ACE-Step LoRA Training Engine (CPU + GPU).
 
 Ported from Side-Step (koda-dernet/Side-Step) into a single self-contained
 module. No external Side-Step dependency required.
 
+Auto-detects GPU (CUDA > MPS > CPU) and uses it when available,
+falling back to CPU. bfloat16 is used on GPU; float32 is forced
+on CPU (bfloat16 deadlocks on CPU -- known PyTorch bug).
+
 Exports:
+    detect_device()        - Auto-detect best available device
+    select_dtype()         - Pick dtype for a device
     preprocess_audio()     - 2-pass sequential preprocessing
     train_lora_generator() - Generator-based LoRA training loop
     cancel_training()      - Set the cancel flag
@@ -63,6 +69,93 @@ def cancel_training() -> None:
     _training_cancel.set()
 
 
+# ============================================================================
+# DEVICE DETECTION & DTYPE SELECTION
+# ============================================================================
+
+def detect_device(requested: str = "auto") -> str:
+    """Return the best available device string.
+
+    Priority: CUDA (best GPU by VRAM) > MPS (Apple Silicon) > CPU.
+    Pass an explicit device string (e.g. "cuda:0", "cpu") to skip
+    auto-detection.
+    """
+    if requested != "auto":
+        return requested
+
+    if torch.cuda.is_available():
+        # Pick the GPU with the most VRAM when multiple are present
+        count = torch.cuda.device_count()
+        if count <= 1:
+            best_idx = 0
+        else:
+            best_idx, best_mem = 0, 0
+            for i in range(count):
+                mem = torch.cuda.get_device_properties(i).total_memory
+                if mem > best_mem:
+                    best_idx, best_mem = i, mem
+            if best_idx != 0:
+                logger.info(
+                    "Multiple CUDA devices (%d). Selected cuda:%d (%s, %.0f MiB).",
+                    count, best_idx,
+                    torch.cuda.get_device_name(best_idx),
+                    best_mem / (1024 ** 2),
+                )
+        device = f"cuda:{best_idx}"
+        logger.info("Auto-detected device: %s (%s)", device, torch.cuda.get_device_name(best_idx))
+        return device
+
+    if hasattr(torch, "mps") and hasattr(torch.mps, "is_available") and torch.mps.is_available():
+        logger.info("Auto-detected device: mps (Apple Silicon)")
+        return "mps"
+
+    logger.info("Auto-detected device: cpu")
+    return "cpu"
+
+
+def select_dtype(device: str) -> torch.dtype:
+    """Select the appropriate training dtype for *device*.
+
+    GPU: bfloat16 if supported, else float16.
+    CPU: MUST stay float32 (bfloat16 deadlocks on CPU).
+    """
+    dev_type = device.split(":")[0]
+    if dev_type == "cpu":
+        return CPU_DTYPE  # always float32
+
+    if dev_type == "cuda":
+        # Prefer bfloat16 on Ampere+ (compute capability >= 8.0)
+        try:
+            idx = int(device.split(":")[1]) if ":" in device else 0
+            props = torch.cuda.get_device_properties(idx)
+            if props.major >= 8:
+                return torch.bfloat16
+        except Exception:
+            pass
+        return torch.float16
+
+    # MPS / other accelerators -- float32 is safest
+    if dev_type == "mps":
+        return torch.float32
+
+    return CPU_DTYPE
+
+
+def _cuda_sync(device: str) -> None:
+    """Synchronize CUDA if the device is a CUDA device (no-op otherwise)."""
+    if device.startswith("cuda") and torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def _clear_gpu_cache(device: str) -> None:
+    """Free cached GPU memory for the given device type."""
+    dev_type = device.split(":")[0]
+    if dev_type == "cuda" and torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    elif dev_type == "mps" and hasattr(torch, "mps") and torch.mps.is_available():
+        torch.mps.empty_cache()
+
+
 # ============================================================================
 # CONFIGS
 # ============================================================================
@@ -448,7 +541,12 @@ def _ensure_acestep_imports():
 
 
 def _attn_candidates(device: str) -> List[str]:
-    """FA2 -> SDPA -> eager, filtered by availability."""
+    """FA2 -> SDPA -> eager, filtered by availability.
+
+    On CUDA with flash_attn installed and compute capability >= 8.0,
+    flash_attention_2 is tried first. On CPU, flash_attention_2 is
+    always skipped (it requires CUDA).
+    """
     candidates = []
     if device.startswith("cuda"):
         try:
@@ -457,8 +555,21 @@ def _attn_candidates(device: str) -> List[str]:
             props = torch.cuda.get_device_properties(dev_idx)
             if props.major >= 8:
                 candidates.append("flash_attention_2")
-        except (ImportError, Exception):
-            pass
+                logger.info(
+                    "flash_attention_2 available (compute %d.%d, flash_attn installed)",
+                    props.major, props.minor,
+                )
+            else:
+                logger.info(
+                    "flash_attention_2 skipped: compute %d.%d < 8.0 (need Ampere+)",
+                    props.major, props.minor,
+                )
+        except ImportError:
+            logger.info("flash_attention_2 skipped: flash_attn package not installed")
+        except Exception as exc:
+            logger.info("flash_attention_2 skipped: %s", exc)
+    else:
+        logger.info("flash_attention_2 skipped: device is %s (not CUDA)", device)
     candidates.extend(["sdpa", "eager"])
     return candidates
 
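Note: the fallback order is easiest to see by example. A sketch of the expected candidate lists, assuming flash_attn imports cleanly and the GPU reports compute capability >= 8.0 (on other setups flash_attention_2 is dropped):

    _attn_candidates("cuda:0")  # -> ["flash_attention_2", "sdpa", "eager"]
    _attn_candidates("cpu")     # -> ["sdpa", "eager"]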
 
@@ -469,8 +580,12 @@ def load_model_for_training(
     from transformers import AutoModel
 
     model_dir = _resolve_model_dir(checkpoint_dir, variant)
-    # CPU always uses float32
-    dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
+    dtype = select_dtype(device)
+
+    logger.info(
+        "Loading model from %s (variant=%s, device=%s, dtype=%s)",
+        model_dir, variant, device, dtype,
+    )
 
     _ensure_acestep_imports()
 
@@ -489,7 +604,7 @@
             if device != "cpu":
                 load_kwargs["device_map"] = {"": device}
             model = AutoModel.from_pretrained(str(model_dir), **load_kwargs)
-            logger.info("Model loaded with attn_implementation=%s", attn)
+            logger.info("Model loaded with attn_implementation=%s on %s", attn, device)
             break
         except Exception as exc:
             err_text = str(exc)
@@ -499,11 +614,23 @@
                     f" Original error: {err_text}"
                 ) from exc
             last_err = exc
-            logger.warning("attn backend '%s' failed: %s", attn, exc)
+            next_attn = candidates[idx + 1] if idx + 1 < len(candidates) else None
+            if next_attn:
+                logger.warning("attn backend '%s' failed: %s; trying '%s'", attn, exc, next_attn)
+            else:
+                logger.warning("attn backend '%s' failed: %s", attn, exc)
 
     if model is None:
         raise RuntimeError(f"Failed to load model from {model_dir}: {last_err}") from last_err
 
+    # If device_map was not used (CPU), move model explicitly
+    if device != "cpu":
+        # device_map already placed weights; just verify dtype
+        if any(p.dtype != dtype for p in model.parameters()):
+            model = model.to(dtype=dtype)
+    else:
+        model = model.to(device=device, dtype=dtype)
+
     for param in model.parameters():
         param.requires_grad = False
     model.eval()
@@ -517,10 +644,11 @@ def load_vae(checkpoint_dir: str, device: str = "cpu"):
517
  if not vae_path.is_dir():
518
  raise FileNotFoundError(f"VAE directory not found: {vae_path}")
519
 
520
- dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
521
  vae = AutoencoderOobleck.from_pretrained(str(vae_path), torch_dtype=dtype)
522
  vae = vae.to(device=device)
523
  vae.eval()
 
524
  return vae
525
 
526
 
@@ -531,11 +659,12 @@ def load_text_encoder(checkpoint_dir: str, device: str = "cpu"):
531
  if not text_path.is_dir():
532
  raise FileNotFoundError(f"Text encoder not found: {text_path}")
533
 
534
- dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
535
  tokenizer = AutoTokenizer.from_pretrained(str(text_path))
536
  encoder = AutoModel.from_pretrained(str(text_path), torch_dtype=dtype)
537
  encoder = encoder.to(device=device)
538
  encoder.eval()
 
539
  return tokenizer, encoder
540
 
541
 
@@ -543,7 +672,7 @@ def load_silence_latent(
     checkpoint_dir: str, device: str = "cpu", variant: str = "base",
 ) -> torch.Tensor:
     ckpt = Path(checkpoint_dir)
-    dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
+    dtype = select_dtype(device)
 
     candidates = [ckpt / "silence_latent.pt"]
     subdir = _VARIANT_DIR.get(variant)
@@ -571,6 +700,14 @@ def unload_models(*models) -> None:
             pass
         del obj
     gc.collect()
+    # Free GPU memory after unloading
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    if hasattr(torch, "mps") and hasattr(torch.mps, "is_available") and torch.mps.is_available():
+        try:
+            torch.mps.empty_cache()
+        except Exception:
+            pass
 
 
 # ============================================================================
@@ -1904,16 +2041,44 @@ def _write_caption_sidecar(audio_path: Path, analysis: Dict[str, Any]) -> Path:
     return sidecar_path
 
 
+def _parse_txt_caption(text: str) -> Dict[str, Any]:
+    """Parse user's .txt caption format into structured fields."""
+    result: Dict[str, Any] = {}
+    lyrics_match = re.search(r'lyrics say "(.*?)" at tempo', text, re.DOTALL)
+    if lyrics_match:
+        result["lyrics"] = lyrics_match.group(1).strip()
+        caption_part = text[:lyrics_match.start()].strip().rstrip(",").strip()
+    else:
+        result["lyrics"] = "[Instrumental]"
+        caption_part = text.strip()
+    bpm_match = re.search(r'at tempo (\d+) BPM', text)
+    if bpm_match:
+        result["bpm"] = bpm_match.group(1)
+        caption_part = re.sub(r'\s*at tempo \d+ BPM.*', '', caption_part).strip()
+    key_match = re.search(r'in the key of ([A-G][#b]?[-\d]*)', text)
+    if key_match:
+        result["key"] = key_match.group(1)
+    result["caption"] = caption_part if caption_part else text[:200]
+    return result
+
+
 def _read_caption_sidecar(audio_path: Path) -> Optional[Dict[str, Any]]:
-    """Read an existing .json caption sidecar if it exists."""
-    sidecar_path = audio_path.with_suffix(".json")
-    if not sidecar_path.is_file():
-        return None
-    try:
-        with open(sidecar_path, "r", encoding="utf-8") as f:
-            return json.load(f)
-    except Exception:
-        return None
+    """Read .json or .txt caption sidecar."""
+    json_path = audio_path.with_suffix(".json")
+    if json_path.is_file():
+        try:
+            with open(json_path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except Exception:
+            pass
+    txt_path = audio_path.with_suffix(".txt")
+    if txt_path.is_file():
+        try:
+            with open(txt_path, "r", encoding="utf-8") as f:
+                return _parse_txt_caption(f.read())
+        except Exception:
+            pass
+    return None
 
 
 # ============================================================================
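Note: a quick check of the phrase patterns the new .txt parser targets (sample caption invented for illustration):

    sample = ('dreamy synthwave, female vocals, lyrics say "neon lights call '
              'my name" at tempo 104 BPM in the key of C')
    print(_parse_txt_caption(sample))
    # {'lyrics': 'neon lights call my name', 'bpm': '104', 'key': 'C',
    #  'caption': 'dreamy synthwave, female vocals'}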
@@ -1924,7 +2089,7 @@ def preprocess_audio(
     audio_dir: str,
     output_dir: str,
     checkpoint_dir: str,
-    device: str = "cpu",
+    device: str = "auto",
     variant: str = "base",
     max_duration: float = 0,
     progress_callback: Optional[Callable] = None,
@@ -1934,7 +2099,13 @@
 
     Pass 1: Load VAE + text encoder, encode audio + text, save intermediates.
     Pass 2: Load DIT model, run encoder, build context, save final .pt files.
+
+    Args:
+        device: "auto" to auto-detect GPU/CPU, or explicit device string.
     """
+    device = detect_device(device)
+    logger.info("Preprocessing on device: %s", device)
+
     out = Path(output_dir)
     out.mkdir(parents=True, exist_ok=True)
 
@@ -1954,7 +2125,7 @@
     if max_duration <= 0:
         max_duration = _detect_max_duration(audio_files)
 
-    dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
+    dtype = select_dtype(device)
 
     # ---- Pass 1: VAE + Text Encoder ----
     logger.info("Pass 1/2: Loading VAE + Text Encoder...")
@@ -2086,6 +2257,7 @@
     finally:
         logger.info("Unloading VAE + Text Encoder...")
         unload_models(vae, text_enc, tokenizer, silence_lat)
+        _clear_gpu_cache(device)
 
     # ---- Pass 2: DIT Encoder ----
     if not intermediates:
@@ -2162,6 +2334,7 @@
     finally:
         logger.info("Unloading DIT model...")
        unload_models(model)
+        _clear_gpu_cache(device)
 
     failed = p1_failed + p2_failed
     return {"processed": processed, "failed": failed, "total": total, "output_dir": str(out)}
@@ -2188,7 +2361,7 @@
     save_every_n_epochs: int = 0,
     seed: int = 42,
     variant: str = "base",
-    device: str = "cpu",
+    device: str = "auto",
     cfg_ratio: float = 0.15,
     timestep_mu: float = -0.4,
     timestep_sigma: float = 1.0,
@@ -2200,10 +2373,20 @@
 
     This is a generator for Gradio live-update compatibility.
     Call cancel_training() to stop after the current epoch.
+
+    Args:
+        device: "auto" to auto-detect GPU/CPU, or explicit device string.
+            GPU uses mixed-precision (bfloat16/float16); CPU stays float32.
     """
     _training_cancel.clear()
     train_start = time.time()
 
+    # Auto-detect device
+    device = detect_device(device)
+    dtype = select_dtype(device)
+    dev_type = device.split(":")[0]
+    use_amp = dev_type == "cuda"
+
     if target_modules is None:
         target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
 
@@ -2215,6 +2398,13 @@
     out_path = Path(output_dir)
     out_path.mkdir(parents=True, exist_ok=True)
 
+    yield f"[INFO] Device: {device}, dtype: {dtype}, AMP: {use_amp}"
+
+    if dev_type == "cuda":
+        gpu_name = torch.cuda.get_device_name(device)
+        gpu_mem = torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
+        yield f"[INFO] GPU: {gpu_name} ({gpu_mem:.1f} GiB VRAM)"
+
     yield "[INFO] Loading model..."
 
     try:
@@ -2223,10 +2413,14 @@
         yield f"[FAIL] Model load failed: {exc}"
         return
 
-    # float32 on CPU (bfloat16 deadlocks)
-    dtype = CPU_DTYPE if device == "cpu" else torch.bfloat16
+    # Ensure model is in the correct dtype (load_model_for_training handles this,
+    # but be explicit for safety)
     model = model.to(dtype=dtype)
 
+    # Move model to device if not already there (CPU path)
+    if dev_type == "cpu":
+        model = model.to(device=device)
+
     yield "[INFO] Injecting LoRA..."
 
     lora_cfg = LoRAConfig(
@@ -2262,11 +2456,14 @@
     loader = DataLoader(
         dataset, batch_size=batch_size, shuffle=True,
         num_workers=0, collate_fn=_collate_batch, drop_last=False,
+        pin_memory=(dev_type == "cuda"),
     )
 
     # Optimizer & scheduler
     torch.manual_seed(seed)
     random.seed(seed)
+    if dev_type == "cuda":
+        torch.cuda.manual_seed_all(seed)
 
     trainable_params = [p for p in model.parameters() if p.requires_grad]
     if not trainable_params:
@@ -2282,6 +2479,13 @@
     yield f"[INFO] Training {sum(p.numel() for p in trainable_params):,} params for {epochs} epochs"
     yield f"[INFO] Steps/epoch: {steps_per_epoch}, total: {total_steps}"
 
+    # GradScaler for mixed precision on GPU (only for float16, not bfloat16)
+    use_grad_scaler = use_amp and dtype == torch.float16
+    grad_scaler = None
+    if use_grad_scaler:
+        grad_scaler = torch.cuda.amp.GradScaler()
+        yield "[INFO] GradScaler enabled (float16 mixed precision)"
+
     # Null condition embedding for CFG dropout
     null_cond = getattr(model, "null_condition_emb", None)
 
@@ -2350,7 +2554,9 @@
                 model.decoder.train()
                 yield f"[OK] Cancelled at epoch {epoch + 1}, saved to {early_path}"
             yield "[DONE]"
+            _cuda_sync(device)
             unload_models(model)
+            _clear_gpu_cache(device)
             return
 
         # Timeout check
@@ -2361,7 +2567,9 @@
             save_lora_adapter(model, early_path)
             yield f"[WARN] Training timed out after {int(elapsed)}s, saved to {early_path}"
             yield "[DONE]"
+            _cuda_sync(device)
             unload_models(model)
+            _clear_gpu_cache(device)
             return
 
         epoch_loss = 0.0
@@ -2369,8 +2577,8 @@
         epoch_start = time.time()
 
         for batch in loader:
-            # Forward
-            nb = device != "cpu"
+            # Move batch tensors to device
+            nb = dev_type != "cpu"
             tgt = batch["target_latents"].to(device, dtype=dtype, non_blocking=nb)
             att = batch["attention_mask"].to(device, dtype=dtype, non_blocking=nb)
             enc_hs = batch["encoder_hidden_states"].to(device, dtype=dtype, non_blocking=nb)
@@ -2395,19 +2603,34 @@
             if force_input_grads:
                 xt = xt.requires_grad_(True)
 
-            # Decoder forward
-            dec_out = model.decoder(
-                hidden_states=xt,
-                timestep=t,
-                timestep_r=t,
-                attention_mask=att,
-                encoder_hidden_states=enc_hs,
-                encoder_attention_mask=enc_mask,
-                context_latents=ctx,
-            )
+            # Decoder forward -- use AMP autocast on GPU for mixed precision
+            if use_amp:
+                with torch.cuda.amp.autocast(dtype=dtype):
+                    dec_out = model.decoder(
+                        hidden_states=xt,
+                        timestep=t,
+                        timestep_r=t,
+                        attention_mask=att,
+                        encoder_hidden_states=enc_hs,
+                        encoder_attention_mask=enc_mask,
+                        context_latents=ctx,
+                    )
+                flow = x1 - x0
+                loss = F.mse_loss(dec_out[0], flow)
+            else:
+                # CPU path -- no autocast
+                dec_out = model.decoder(
+                    hidden_states=xt,
+                    timestep=t,
+                    timestep_r=t,
+                    attention_mask=att,
+                    encoder_hidden_states=enc_hs,
+                    encoder_attention_mask=enc_mask,
+                    context_latents=ctx,
+                )
+                flow = x1 - x0
+                loss = F.mse_loss(dec_out[0], flow)
 
-            flow = x1 - x0
-            loss = F.mse_loss(dec_out[0], flow)
             loss = loss.float()  # fp32 for stable backward
 
             # NaN guard
@@ -2416,7 +2639,9 @@
                 del loss, tgt, att, enc_hs, enc_mask, ctx, xt, dec_out, flow
                 if consecutive_nan >= MAX_NAN:
                     yield f"[FAIL] {consecutive_nan} consecutive NaN losses, halting"
+                    _cuda_sync(device)
                     unload_models(model)
+                    _clear_gpu_cache(device)
                     return
                 if acc_step > 0:
                     optimizer.zero_grad(set_to_none=True)
@@ -2426,14 +2651,27 @@
             consecutive_nan = 0
 
             loss = loss / gradient_accumulation_steps
-            loss.backward()
+
+            # Backward -- use GradScaler on float16 GPU
+            if grad_scaler is not None:
+                grad_scaler.scale(loss).backward()
+            else:
+                loss.backward()
+
             acc_loss += loss.item()
             del loss, tgt, att, enc_hs, enc_mask, ctx, xt, dec_out, flow
             acc_step += 1
 
             if acc_step >= gradient_accumulation_steps:
-                torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
-                optimizer.step()
+                if grad_scaler is not None:
+                    grad_scaler.unscale_(optimizer)
+                    torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
+                    grad_scaler.step(optimizer)
+                    grad_scaler.update()
+                else:
+                    torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
+                    optimizer.step()
+
                 scheduler.step()
                 global_step += 1
 
@@ -2454,10 +2692,20 @@
                 acc_loss = 0.0
                 acc_step = 0
 
+                # Periodic GPU cache cleanup
+                if dev_type == "cuda" and global_step % log_every == 0:
+                    torch.cuda.empty_cache()
+
         # Flush remainder
         if acc_step > 0:
-            torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
-            optimizer.step()
+            if grad_scaler is not None:
+                grad_scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
+                grad_scaler.step(optimizer)
+                grad_scaler.update()
+            else:
+                torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
+                optimizer.step()
             scheduler.step()
             global_step += 1
             avg_loss = acc_loss * gradient_accumulation_steps / acc_step
@@ -2506,10 +2754,15 @@
             model.decoder.train()
             yield f"[OK] Checkpoint saved at epoch {epoch + 1}"
 
+        # Clear GPU cache after epoch + checkpoint save
+        _clear_gpu_cache(device)
+
     # Sanity check
     if global_step == 0:
         yield "[FAIL] Training completed 0 steps -- no batches processed"
+        _cuda_sync(device)
         unload_models(model)
+        _clear_gpu_cache(device)
         return
 
     # Final save (directly to output_dir, not a subdirectory)
@@ -2525,7 +2778,9 @@
         f" Adapter ready for inference."
     )
     yield "[DONE]"
+    _cuda_sync(device)
     unload_models(model)
+    _clear_gpu_cache(device)
 
 
 # ============================================================================
 