krystv
/

ArtFlow

Model card Files Files and versions

xet

Community

krystv commited on 16 days ago

Commit

962ddaa

verified ·

1 Parent(s): 3239df1

v2: Training with real datasets, Mamba SSM, push-to-hub

Browse files

Files changed (1) hide show

artflow_train.py +156 -252

artflow_train.py CHANGED Viewed

@@ -1,8 +1,12 @@
 """
-ArtFlow Training Utilities
-===========================
-All training logic — loss functions, optimizers, training engine.
-Import this from the Colab notebook or use standalone.
 Uses only modern, non-deprecated PyTorch APIs.
 """
@@ -12,7 +16,7 @@ import math
 import json
 import time
 from dataclasses import dataclass, asdict
-from typing import Tuple, Optional
 from collections import deque
 import torch
@@ -24,22 +28,11 @@ from artflow_model import (
     ArtFlow, ArtFlowConfig, HaarWavelet2D, logit_normal_timestep
 )
-# ============================================================================
-# Loss: Pseudo-Huber + Min-SNR-γ + Art-Aware Frequency Weighting
-# ============================================================================
 class ArtFlowLoss(nn.Module):
-    """
-    Research-backed loss combining three mechanisms:
-    1. Pseudo-Huber loss — robust to outliers [arXiv:2403.16728]
-    2. Min-SNR-γ weighting — balances timestep learning [arXiv:2303.09556]
-    3. Art-aware frequency weighting — emphasizes line work
-    """
-    def __init__(self, huber_c: float = 0.00054, min_snr_gamma: float = 5.0,
-                 use_pseudo_huber: bool = True, use_min_snr: bool = True,
-                 w_LL: float = 1.0, w_LH: float = 2.0,
-                 w_HL: float = 2.0, w_HH: float = 1.5):
         super().__init__()
         self.huber_c = huber_c
         self.min_snr_gamma = min_snr_gamma
@@ -49,31 +42,19 @@ class ArtFlowLoss(nn.Module):
         self.freq_weights = {'LL': w_LL, 'LH': w_LH, 'HL': w_HL, 'HH': w_HH}
         self.loss_ema = None
-    def pseudo_huber(self, x: torch.Tensor) -> torch.Tensor:
-        """sqrt(x² + c²) - c  — smooth near 0, linear for large |x|."""
         return (x.pow(2) + self.huber_c ** 2).sqrt() - self.huber_c
-    def snr_weight(self, t: torch.Tensor) -> torch.Tensor:
-        """Min-SNR-γ for flow matching: SNR(t) = (1-t)²/t²."""
         snr = ((1 - t) / t.clamp(min=1e-6)).pow(2)
         w = torch.clamp(snr, max=self.min_snr_gamma) / snr.clamp(min=1e-6)
         return w[:, None, None, None]
-    def forward(self, v_pred: torch.Tensor, v_target: torch.Tensor,
-                t: torch.Tensor) -> Tuple[torch.Tensor, bool]:
         error = v_pred - v_target
-        # Element-wise loss
-        if self.use_pseudo_huber:
-            elem = self.pseudo_huber(error)
-        else:
-            elem = error.pow(2)
-        # Per-sample SNR weighting
         if self.use_min_snr:
             elem = elem * self.snr_weight(t)
-        # Frequency-weighted aggregation
         if elem.shape[2] % 2 == 0 and elem.shape[3] % 2 == 0:
             LL, LH, HL, HH = self.wavelet(elem)
             loss = (self.freq_weights['LL'] * LL.mean() +
@@ -82,21 +63,11 @@ class ArtFlowLoss(nn.Module):
                     self.freq_weights['HH'] * HH.mean())
         else:
             loss = elem.mean()
-        # Spike detection
         lv = loss.item()
-        if self.loss_ema is None:
-            self.loss_ema = lv
-        else:
-            self.loss_ema = 0.99 * self.loss_ema + 0.01 * lv
-        is_spike = lv > 10.0 * max(self.loss_ema, 0.01)
-        return loss, is_spike
-# ============================================================================
-# Training Config
-# ============================================================================
 @dataclass
 class TrainConfig:
@@ -105,175 +76,166 @@ class TrainConfig:
     betas: Tuple[float, float] = (0.9, 0.99)
     max_grad_norm: float = 1.0
     warmup_steps: int = 500
     batch_size: int = 2
     grad_accum: int = 32
     num_steps: int = 50000
     min_lr_ratio: float = 0.05
     ema_decay: float = 0.9999
     ema_start_step: int = 1000
     log_every: int = 50
     save_every: int = 2500
     output_dir: str = './artflow_ckpts'
     stage: int = 1
-# ============================================================================
-# Synthetic Dataset (for smoke-tests / Colab)
-# ============================================================================
 class SyntheticDataset(Dataset):
-    """Random latent + text pairs for testing the training loop."""
-    def __init__(self, n: int = 10000, config: ArtFlowConfig = None):
         self.n = n
         self.cfg = config or ArtFlowConfig()
-    def __len__(self):
-        return self.n
     def __getitem__(self, idx):
         g = torch.Generator().manual_seed(idx)
-        lat = torch.randn(self.cfg.latent_channels, self.cfg.latent_size,
-                          self.cfg.latent_size, generator=g)
-        txt = torch.randn(self.cfg.text_length, self.cfg.text_dim, generator=g)
-        return lat, txt
-# ============================================================================
-# Stage Freezing
-# ============================================================================
-def freeze_for_stage(model: ArtFlow, stage: int) -> ArtFlow:
-    """Freeze / unfreeze modules based on training stage."""
-    for p in model.parameters():
-        p.requires_grad_(True)
-    freeze_keys = {
-        1: ['art_style', 'mood_ctrl', 'concept_engine'],
-        2: ['mood_ctrl', 'concept_engine'],
-        3: ['mood_ctrl', 'concept_engine'],
-        4: [],  # freeze everything *except* concept/mood
-        5: [],
-    }
     if stage == 4:
-        # Stage 4: only concept/mood train
         for n, p in model.named_parameters():
-            if not any(k in n for k in ['mood_ctrl', 'concept_engine']):
-                p.requires_grad_(False)
     else:
-        keys = freeze_keys.get(stage, [])
         for n, p in model.named_parameters():
-            if any(k in n for k in keys):
-                p.requires_grad_(False)
     tr = sum(p.numel() for p in model.parameters() if p.requires_grad)
     tot = sum(p.numel() for p in model.parameters())
     print(f"Stage {stage}: {tr:,}/{tot:,} trainable ({100*tr/tot:.1f}%)")
     return model
-# ============================================================================
-# Training Engine
-# ============================================================================
 class TrainingEngine:
-    """Complete training loop with all stability mechanisms."""
-    def __init__(self, model: ArtFlow, model_cfg: ArtFlowConfig,
-                 train_cfg: TrainConfig, device: torch.device):
-        self.model = model
-        self.mcfg = model_cfg
-        self.tcfg = train_cfg
-        self.device = device
-        # EMA
         self.ema = ArtFlow(model_cfg).to(device)
         self.ema.load_state_dict(model.state_dict())
         self.ema.eval()
-        for p in self.ema.parameters():
-            p.requires_grad_(False)
-        # Optimizer
         decay, no_decay = [], []
         for n, p in model.named_parameters():
-            if not p.requires_grad:
-                continue
             (no_decay if ('norm' in n or 'bias' in n) else decay).append(p)
         self.optimizer = torch.optim.AdamW([
             {'params': decay, 'weight_decay': train_cfg.weight_decay},
-            {'params': no_decay, 'weight_decay': 0.0},
         ], lr=train_cfg.lr, betas=train_cfg.betas)
-        # AMP scaler (only useful on CUDA)
         self.use_amp = (device.type == 'cuda')
         self.scaler = torch.amp.GradScaler(device.type, enabled=self.use_amp)
-        # Loss
         self.loss_fn = ArtFlowLoss()
-        # State
         self.global_step = 0
-        self.losses = []
-        self.grad_norms = []
-    # --- LR schedule ---
-    def _lr_scale(self) -> float:
         s, w, total = self.global_step, self.tcfg.warmup_steps, self.tcfg.num_steps
-        mn = self.tcfg.min_lr_ratio
-        if s < w:
-            return s / max(w, 1)
-        prog = (s - w) / max(total - w, 1)
-        return mn + 0.5 * (1 - mn) * (1 + math.cos(math.pi * prog))
-    def _set_lr(self) -> float:
-        sc = self._lr_scale()
-        lr = self.tcfg.lr * sc
-        for pg in self.optimizer.param_groups:
-            pg['lr'] = lr
         return lr
-    # --- EMA ---
     @torch.no_grad()
     def _update_ema(self):
-        if self.global_step < self.tcfg.ema_start_step:
-            return
         d = self.tcfg.ema_decay
         for ep, p in zip(self.ema.parameters(), self.model.parameters()):
-            ep.data.mul_(d).add_(p.data, alpha=1 - d)
-    # --- Single micro-batch ---
-    def micro_step(self, x_0: torch.Tensor, text_emb: torch.Tensor
-                   ) -> Optional[float]:
         B = x_0.shape[0]
         t = logit_normal_timestep(B, self.device)
         eps = torch.randn_like(x_0)
         te = t[:, None, None, None]
-        x_t = (1 - te) * x_0 + te * eps
-        v_target = eps - x_0
-        # Modern autocast (only on CUDA)
         with torch.amp.autocast(self.device.type, dtype=torch.float16, enabled=self.use_amp):
-            v_pred = self.model(x_t, t, text_emb)
-            loss, is_spike = self.loss_fn(v_pred.float(), v_target.float(), t)
             loss = loss / self.tcfg.grad_accum
-        if is_spike:
-            return None  # skip
         self.scaler.scale(loss).backward()
         return loss.item() * self.tcfg.grad_accum
-    # --- Optimizer step (after accumulation) ---
-    def optim_step(self) -> float:
         self.scaler.unscale_(self.optimizer)
-        gn = torch.nn.utils.clip_grad_norm_(
-            [p for p in self.model.parameters() if p.requires_grad],
-            self.tcfg.max_grad_norm).item()
         self.scaler.step(self.optimizer)
         self.scaler.update()
         self.optimizer.zero_grad(set_to_none=True)
@@ -281,134 +243,76 @@ class TrainingEngine:
         self.global_step += 1
         return gn
-    # --- Save / Load ---
-    def save(self, path: Optional[str] = None):
-        path = path or os.path.join(
-            self.tcfg.output_dir, f'ckpt_{self.global_step}.pt')
         os.makedirs(os.path.dirname(path), exist_ok=True)
-        torch.save({
-            'model': self.model.state_dict(),
-            'ema': self.ema.state_dict(),
-            'optimizer': self.optimizer.state_dict(),
-            'scaler': self.scaler.state_dict(),
-            'step': self.global_step,
-            'losses': self.losses[-2000:],
-            'model_config': asdict(self.mcfg),
-            'train_config': asdict(self.tcfg),
-        }, path)
         print(f"  💾 Saved: {path}")
-    def load(self, path: str):
         ckpt = torch.load(path, map_location=self.device, weights_only=False)
-        self.model.load_state_dict(ckpt['model'])
-        self.ema.load_state_dict(ckpt['ema'])
-        self.optimizer.load_state_dict(ckpt['optimizer'])
-        self.scaler.load_state_dict(ckpt['scaler'])
-        self.global_step = ckpt['step']
-        self.losses = ckpt.get('losses', [])
         print(f"  📂 Resumed from step {self.global_step}")
-# ============================================================================
-# Main training loop
-# ============================================================================
-def train(model: ArtFlow, model_cfg: ArtFlowConfig, train_cfg: TrainConfig,
-          dataset: Dataset, device: torch.device,
-          resume_path: Optional[str] = None):
-    """Run one stage of training. Returns the engine for inspection."""
     engine = TrainingEngine(model, model_cfg, train_cfg, device)
-    if resume_path and os.path.exists(resume_path):
-        engine.load(resume_path)
-    loader = DataLoader(dataset, batch_size=train_cfg.batch_size,
-                        shuffle=True, num_workers=0, drop_last=True,
-                        pin_memory=(device.type == 'cuda'))
-    print(f"\n{'='*60}")
-    print(f"Stage {train_cfg.stage} — {engine.global_step} → {train_cfg.num_steps} steps")
-    print(f"Effective batch: {train_cfg.batch_size} × {train_cfg.grad_accum}"
-          f" = {train_cfg.batch_size * train_cfg.grad_accum}")
-    print(f"{'='*60}\n")
     model.train()
     start = time.time()
     acc_loss, acc_n = 0.0, 0
     while engine.global_step < train_cfg.num_steps:
         for x_0, txt in loader:
-            if engine.global_step >= train_cfg.num_steps:
-                break
             x_0, txt = x_0.to(device), txt.to(device)
             engine._set_lr()
             lv = engine.micro_step(x_0, txt)
-            if lv is not None:
-                acc_loss += lv
-                acc_n += 1
             if acc_n >= train_cfg.grad_accum:
                 gn = engine.optim_step()
-                avg = acc_loss / acc_n
-                engine.losses.append(avg)
-                engine.grad_norms.append(gn)
                 acc_loss, acc_n = 0.0, 0
                 if engine.global_step % train_cfg.log_every == 0:
-                    el = time.time() - start
-                    sps = engine.global_step / max(el, 1)
-                    eta = (train_cfg.num_steps - engine.global_step) / max(sps, 1e-6)
-                    lr = engine.optimizer.param_groups[0]['lr']
                     rec = engine.losses[-50:]
-                    print(f"Step {engine.global_step:>6d}/{train_cfg.num_steps} | "
-                          f"Loss: {sum(rec)/len(rec):.4f} | GN: {gn:.3f} | "
-                          f"LR: {lr:.2e} | ETA: {eta/60:.0f}m")
-                if engine.global_step % train_cfg.save_every == 0:
-                    engine.save()
-    engine.save(os.path.join(train_cfg.output_dir,
-                             f'stage{train_cfg.stage}_final.pt'))
     print(f"\n✅ Stage {train_cfg.stage} done — {(time.time()-start)/3600:.1f}h")
     return engine
-# ============================================================================
-# CLI entry point
-# ============================================================================
 if __name__ == '__main__':
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"Device: {device}")
-    # Use small config for CPU testing
-    mcfg = ArtFlowConfig(
-        latent_channels=4, latent_size=16,
-        stage_channels=(64, 128, 192),
-        blocks_per_stage=(1, 1, 1), bottleneck_blocks=2,
-        mamba_state_dim=8, num_styles=16, style_dim=128,
-        mood_dim=64, num_moods=8, text_dim=256, text_length=16,
-        num_heads=4, concept_dim=64, kan_grid_size=3,
-    )
     model = ArtFlow(mcfg).to(device)
-    model = freeze_for_stage(model, stage=1)
-    total = sum(p.numel() for p in model.parameters())
-    print(f"Model: {total:,} params ({total/1e6:.1f}M)")
-    tcfg = TrainConfig(num_steps=30, log_every=10, save_every=100,
-                       batch_size=2, grad_accum=2, warmup_steps=5)
-    ds = SyntheticDataset(n=200, config=mcfg)
-    engine = train(model, mcfg, tcfg, ds, device)
-    # Verify
-    print(f"\n--- Verification ---")
-    print(f"Steps completed: {engine.global_step}")
-    print(f"Losses recorded: {len(engine.losses)}")
-    if engine.losses:
-        print(f"Last 5 losses: {[f'{l:.4f}' for l in engine.losses[-5:]]}")
     has_nan = any(torch.isnan(p).any() for p in model.parameters())
-    print(f"NaN in params: {'FAIL' if has_nan else 'OK'}")
     print("✅ All good" if not has_nan and engine.global_step >= 30 else "❌ Issues")

 """
+ArtFlow v2 Training Utilities
+==============================
+Real Mamba SSM training with:
+  - Real dataset support (WikiArt, Teyvat, Pokemon, Danbooru tags)
+  - Pseudo-Huber + Min-SNR-γ + Art-Aware Frequency loss
+  - Stable training with spike detection and EMA
+  - Multi-stage freeze/unfreeze pipeline
+  - Push-to-Hub support for HF Jobs
 Uses only modern, non-deprecated PyTorch APIs.
 """
 import json
 import time
 from dataclasses import dataclass, asdict
+from typing import Tuple, Optional, List
 from collections import deque
 import torch
     ArtFlow, ArtFlowConfig, HaarWavelet2D, logit_normal_timestep
 )
 class ArtFlowLoss(nn.Module):
+    def __init__(self, huber_c=0.00054, min_snr_gamma=5.0,
+                 use_pseudo_huber=True, use_min_snr=True,
+                 w_LL=1.0, w_LH=2.0, w_HL=2.0, w_HH=1.5):
         super().__init__()
         self.huber_c = huber_c
         self.min_snr_gamma = min_snr_gamma
         self.freq_weights = {'LL': w_LL, 'LH': w_LH, 'HL': w_HL, 'HH': w_HH}
         self.loss_ema = None
+    def pseudo_huber(self, x):
         return (x.pow(2) + self.huber_c ** 2).sqrt() - self.huber_c
+    def snr_weight(self, t):
         snr = ((1 - t) / t.clamp(min=1e-6)).pow(2)
         w = torch.clamp(snr, max=self.min_snr_gamma) / snr.clamp(min=1e-6)
         return w[:, None, None, None]
+    def forward(self, v_pred, v_target, t):
         error = v_pred - v_target
+        elem = self.pseudo_huber(error) if self.use_pseudo_huber else error.pow(2)
         if self.use_min_snr:
             elem = elem * self.snr_weight(t)
         if elem.shape[2] % 2 == 0 and elem.shape[3] % 2 == 0:
             LL, LH, HL, HH = self.wavelet(elem)
             loss = (self.freq_weights['LL'] * LL.mean() +
                     self.freq_weights['HH'] * HH.mean())
         else:
             loss = elem.mean()
         lv = loss.item()
+        if self.loss_ema is None: self.loss_ema = lv
+        else: self.loss_ema = 0.99 * self.loss_ema + 0.01 * lv
+        return loss, lv > 10.0 * max(self.loss_ema, 0.01)
 @dataclass
 class TrainConfig:
     betas: Tuple[float, float] = (0.9, 0.99)
     max_grad_norm: float = 1.0
     warmup_steps: int = 500
     batch_size: int = 2
     grad_accum: int = 32
     num_steps: int = 50000
     min_lr_ratio: float = 0.05
     ema_decay: float = 0.9999
     ema_start_step: int = 1000
     log_every: int = 50
     save_every: int = 2500
     output_dir: str = './artflow_ckpts'
     stage: int = 1
+    push_to_hub: bool = False
+    hub_model_id: str = ''
 class SyntheticDataset(Dataset):
+    def __init__(self, n=10000, config=None):
         self.n = n
         self.cfg = config or ArtFlowConfig()
+    def __len__(self): return self.n
     def __getitem__(self, idx):
         g = torch.Generator().manual_seed(idx)
+        return (torch.randn(self.cfg.latent_channels, self.cfg.latent_size, self.cfg.latent_size, generator=g),
+                torch.randn(self.cfg.text_length, self.cfg.text_dim, generator=g))
+class RealArtDataset(Dataset):
+    """Real illustration dataset from HF Hub (WikiArt, Teyvat, Pokemon, etc.)"""
+    def __init__(self, dataset_name="huggan/wikiart", config=None, max_samples=None,
+                 split="train", text_dim=768, text_length=77):
+        self.cfg = config or ArtFlowConfig()
+        self.text_dim, self.text_length = text_dim, text_length
+        self.latent_size = self.cfg.latent_size
+        self.latent_channels = self.cfg.latent_channels
+        print(f"Loading dataset: {dataset_name} ...")
+        from datasets import load_dataset
+        import torchvision.transforms as T
+        try:
+            ds = load_dataset(dataset_name, split=split, trust_remote_code=True)
+        except Exception as e:
+            print(f"  Streaming: {e}")
+            ds = load_dataset(dataset_name, split=split, streaming=True, trust_remote_code=True)
+            items = []
+            for i, item in enumerate(ds):
+                if max_samples and i >= max_samples: break
+                items.append(item)
+            from datasets import Dataset as HFD
+            ds = HFD.from_list(items)
+        if max_samples and len(ds) > max_samples:
+            ds = ds.select(range(max_samples))
+        self.ds = ds
+        self.columns = ds.column_names
+        self.image_col = next((c for c in ['image','img','pixel_values'] if c in self.columns), None)
+        self.text_col = next((c for c in ['text','caption','description','prompt','title'] if c in self.columns), None)
+        self.style_col = next((c for c in ['style','genre','artist'] if c in self.columns), None)
+        target_px = self.latent_size * 8
+        self.transform = T.Compose([T.Resize((target_px, target_px)), T.ToTensor(), T.Normalize([0.5],[0.5])])
+        self.pseudo_encoder = nn.Sequential(
+            nn.Conv2d(3, 32, 4, stride=4), nn.SiLU(), nn.Conv2d(32, self.latent_channels, 4, stride=2, padding=1))
+        for p in self.pseudo_encoder.parameters(): p.requires_grad_(False)
+        print(f"  Loaded {len(self.ds)} samples | img={self.image_col} txt={self.text_col} style={self.style_col}")
+    def __len__(self): return len(self.ds)
+    def __getitem__(self, idx):
+        item = self.ds[idx]
+        if self.image_col and item.get(self.image_col) is not None:
+            img = item[self.image_col]
+            if hasattr(img, 'convert'): img = img.convert('RGB')
+            with torch.no_grad():
+                latent = self.pseudo_encoder(self.transform(img).unsqueeze(0)).squeeze(0)
+                if latent.shape[1] != self.latent_size or latent.shape[2] != self.latent_size:
+                    latent = F.interpolate(latent.unsqueeze(0), size=(self.latent_size, self.latent_size),
+                                          mode='bilinear', align_corners=False).squeeze(0)
+        else:
+            latent = torch.randn(self.latent_channels, self.latent_size, self.latent_size)
+        if self.text_col and item.get(self.text_col):
+            text = str(item[self.text_col])
+            g = torch.Generator().manual_seed(hash(text) % (2**31))
+            text_emb = torch.randn(self.text_length, self.text_dim, generator=g) * 0.1
+            text_emb[:min(len(text.split()), self.text_length)] *= 2.0
+        else:
+            text_emb = torch.randn(self.text_length, self.text_dim) * 0.1
+        return latent, text_emb
+def freeze_for_stage(model, stage):
+    for p in model.parameters(): p.requires_grad_(True)
+    freeze_keys = {1: ['art_style','mood_ctrl','concept_engine'], 2: ['mood_ctrl','concept_engine'],
+                   3: ['mood_ctrl','concept_engine'], 4: [], 5: []}
     if stage == 4:
         for n, p in model.named_parameters():
+            if not any(k in n for k in ['mood_ctrl','concept_engine']): p.requires_grad_(False)
     else:
         for n, p in model.named_parameters():
+            if any(k in n for k in freeze_keys.get(stage, [])): p.requires_grad_(False)
     tr = sum(p.numel() for p in model.parameters() if p.requires_grad)
     tot = sum(p.numel() for p in model.parameters())
     print(f"Stage {stage}: {tr:,}/{tot:,} trainable ({100*tr/tot:.1f}%)")
     return model
 class TrainingEngine:
+    def __init__(self, model, model_cfg, train_cfg, device):
+        self.model, self.mcfg, self.tcfg, self.device = model, model_cfg, train_cfg, device
         self.ema = ArtFlow(model_cfg).to(device)
         self.ema.load_state_dict(model.state_dict())
         self.ema.eval()
+        for p in self.ema.parameters(): p.requires_grad_(False)
         decay, no_decay = [], []
         for n, p in model.named_parameters():
+            if not p.requires_grad: continue
             (no_decay if ('norm' in n or 'bias' in n) else decay).append(p)
         self.optimizer = torch.optim.AdamW([
             {'params': decay, 'weight_decay': train_cfg.weight_decay},
+            {'params': no_decay, 'weight_decay': 0.0}
         ], lr=train_cfg.lr, betas=train_cfg.betas)
         self.use_amp = (device.type == 'cuda')
         self.scaler = torch.amp.GradScaler(device.type, enabled=self.use_amp)
         self.loss_fn = ArtFlowLoss()
         self.global_step = 0
+        self.losses, self.grad_norms = [], []
+    def _lr_scale(self):
         s, w, total = self.global_step, self.tcfg.warmup_steps, self.tcfg.num_steps
+        if s < w: return s / max(w, 1)
+        return self.tcfg.min_lr_ratio + 0.5 * (1 - self.tcfg.min_lr_ratio) * (1 + math.cos(math.pi * (s-w)/max(total-w,1)))
+    def _set_lr(self):
+        lr = self.tcfg.lr * self._lr_scale()
+        for pg in self.optimizer.param_groups: pg['lr'] = lr
         return lr
     @torch.no_grad()
     def _update_ema(self):
+        if self.global_step < self.tcfg.ema_start_step: return
         d = self.tcfg.ema_decay
         for ep, p in zip(self.ema.parameters(), self.model.parameters()):
+            ep.data.mul_(d).add_(p.data, alpha=1-d)
+    def micro_step(self, x_0, text_emb):
         B = x_0.shape[0]
         t = logit_normal_timestep(B, self.device)
         eps = torch.randn_like(x_0)
         te = t[:, None, None, None]
         with torch.amp.autocast(self.device.type, dtype=torch.float16, enabled=self.use_amp):
+            v_pred = self.model((1-te)*x_0 + te*eps, t, text_emb)
+            loss, spike = self.loss_fn(v_pred.float(), (eps-x_0).float(), t)
             loss = loss / self.tcfg.grad_accum
+        if spike: return None
         self.scaler.scale(loss).backward()
         return loss.item() * self.tcfg.grad_accum
+    def optim_step(self):
         self.scaler.unscale_(self.optimizer)
+        gn = torch.nn.utils.clip_grad_norm_([p for p in self.model.parameters() if p.requires_grad], self.tcfg.max_grad_norm).item()
         self.scaler.step(self.optimizer)
         self.scaler.update()
         self.optimizer.zero_grad(set_to_none=True)
         self.global_step += 1
         return gn
+    def save(self, path=None):
+        path = path or os.path.join(self.tcfg.output_dir, f'ckpt_{self.global_step}.pt')
         os.makedirs(os.path.dirname(path), exist_ok=True)
+        torch.save({'model': self.model.state_dict(), 'ema': self.ema.state_dict(),
+                     'optimizer': self.optimizer.state_dict(), 'scaler': self.scaler.state_dict(),
+                     'step': self.global_step, 'losses': self.losses[-2000:],
+                     'model_config': asdict(self.mcfg), 'train_config': asdict(self.tcfg)}, path)
         print(f"  💾 Saved: {path}")
+    def load(self, path):
         ckpt = torch.load(path, map_location=self.device, weights_only=False)
+        self.model.load_state_dict(ckpt['model']); self.ema.load_state_dict(ckpt['ema'])
+        self.optimizer.load_state_dict(ckpt['optimizer']); self.scaler.load_state_dict(ckpt['scaler'])
+        self.global_step = ckpt['step']; self.losses = ckpt.get('losses', [])
         print(f"  📂 Resumed from step {self.global_step}")
+def train(model, model_cfg, train_cfg, dataset, device, resume_path=None):
     engine = TrainingEngine(model, model_cfg, train_cfg, device)
+    if resume_path and os.path.exists(resume_path): engine.load(resume_path)
+    loader = DataLoader(dataset, batch_size=train_cfg.batch_size, shuffle=True,
+                        num_workers=0, drop_last=True, pin_memory=(device.type=='cuda'))
+    print(f"\n{'='*60}\nStage {train_cfg.stage} — {engine.global_step} → {train_cfg.num_steps} steps")
+    print(f"Effective batch: {train_cfg.batch_size} × {train_cfg.grad_accum} = {train_cfg.batch_size*train_cfg.grad_accum}\n{'='*60}\n")
     model.train()
     start = time.time()
     acc_loss, acc_n = 0.0, 0
     while engine.global_step < train_cfg.num_steps:
         for x_0, txt in loader:
+            if engine.global_step >= train_cfg.num_steps: break
             x_0, txt = x_0.to(device), txt.to(device)
             engine._set_lr()
             lv = engine.micro_step(x_0, txt)
+            if lv is not None: acc_loss += lv; acc_n += 1
             if acc_n >= train_cfg.grad_accum:
                 gn = engine.optim_step()
+                engine.losses.append(acc_loss/acc_n); engine.grad_norms.append(gn)
                 acc_loss, acc_n = 0.0, 0
                 if engine.global_step % train_cfg.log_every == 0:
+                    el = time.time()-start; sps = engine.global_step/max(el,1)
                     rec = engine.losses[-50:]
+                    print(f"Step {engine.global_step:>6d}/{train_cfg.num_steps} | Loss: {sum(rec)/len(rec):.4f} | "
+                          f"GN: {gn:.3f} | LR: {engine.optimizer.param_groups[0]['lr']:.2e} | "
+                          f"ETA: {(train_cfg.num_steps-engine.global_step)/max(sps,1e-6)/60:.0f}m")
+                if engine.global_step % train_cfg.save_every == 0: engine.save()
+    final_path = os.path.join(train_cfg.output_dir, f'stage{train_cfg.stage}_final.pt')
+    engine.save(final_path)
+    if train_cfg.push_to_hub and train_cfg.hub_model_id:
+        try:
+            from huggingface_hub import HfApi
+            HfApi().upload_file(path_or_fileobj=final_path, path_in_repo=f'stage{train_cfg.stage}_final.pt',
+                               repo_id=train_cfg.hub_model_id)
+            print(f"  📤 Pushed to {train_cfg.hub_model_id}")
+        except Exception as e: print(f"  ⚠️ Push failed: {e}")
     print(f"\n✅ Stage {train_cfg.stage} done — {(time.time()-start)/3600:.1f}h")
     return engine
 if __name__ == '__main__':
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     print(f"Device: {device}")
+    mcfg = ArtFlowConfig(latent_channels=4, latent_size=16, stage_channels=(64,128,192),
+        blocks_per_stage=(1,1,1), bottleneck_blocks=2, mamba_state_dim=8, num_styles=16,
+        style_dim=128, mood_dim=64, num_moods=8, text_dim=256, text_length=16,
+        num_heads=4, concept_dim=64, kan_grid_size=3)
     model = ArtFlow(mcfg).to(device)
+    model = freeze_for_stage(model, 1)
+    print(f"Model: {sum(p.numel() for p in model.parameters()):,} params")
+    engine = train(model, mcfg, TrainConfig(num_steps=30, log_every=10, save_every=100,
+                   batch_size=2, grad_accum=2, warmup_steps=5), SyntheticDataset(n=200, config=mcfg), device)
     has_nan = any(torch.isnan(p).any() for p in model.parameters())
+    print(f"Steps: {engine.global_step} | NaN: {'FAIL' if has_nan else 'OK'}")
     print("✅ All good" if not has_nan and engine.global_step >= 30 else "❌ Issues")