AbstractPhil
/

tiny-flux-deep

+# ============================================================================
+# TinyFlux-Deep Training Cell - With Expert Distillation (Precached)
+# ============================================================================
+# Integrates SD1.5-flow-lune as a frozen timestep expert.
+# Expert features are PRECACHED at 10 timestep buckets for speed.
+# The ExpertPredictor learns to emulate expert features from (t, CLIP).
+# At inference, no expert needed - predictor runs standalone.
+#
+# USAGE: Run model.py cell first, then this cell
+# ============================================================================
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+from datasets import load_dataset, concatenate_datasets
+from transformers import T5EncoderModel, T5Tokenizer, CLIPTextModel, CLIPTokenizer
+from huggingface_hub import HfApi, hf_hub_download
+from safetensors.torch import save_file, load_file
+from torch.utils.tensorboard import SummaryWriter
+from tqdm.auto import tqdm
+import numpy as np
+import math
+import json
+import random
+from typing import Tuple, Optional, Dict, List
+import os
+from datetime import datetime
+from PIL import Image
+# ============================================================================
+# CUDA OPTIMIZATIONS
+# ============================================================================
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.backends.cudnn.benchmark = True
+torch.set_float32_matmul_precision('high')
+import warnings
+warnings.filterwarnings('ignore', message='.*TF32.*')
+# ============================================================================
+# CONFIG
+# ============================================================================
+BATCH_SIZE = 16
+GRAD_ACCUM = 2
+LR = 3e-4
+EPOCHS = 40
+MAX_SEQ = 128
+SHIFT = 3.0
+DEVICE = "cuda"
+DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+ALLOW_WEIGHT_UPGRADE = True
+# HuggingFace Hub
+HF_REPO = "AbstractPhil/tiny-flux-deep"
+SAVE_EVERY = 625
+UPLOAD_EVERY = 625
+SAMPLE_EVERY = 312
+LOG_EVERY = 10
+LOG_UPLOAD_EVERY = 625
+# Checkpoint loading
+LOAD_TARGET = "hub:step_305000"
+RESUME_STEP = None
+# ============================================================================
+# EXPERT DISTILLATION CONFIG
+# ============================================================================
+ENABLE_EXPERT_DISTILLATION = True
+EXPERT_CHECKPOINT = "AbstractPhil/sd15-flow-lune-flux"
+EXPERT_CHECKPOINT_PATH = "flux_t2_6_pose_t4_6_port_t1_4/checkpoint-00018765/unet/diffusion_pytorch_model.safetensors"
+EXPERT_DIM = 1280
+EXPERT_HIDDEN_DIM = 512
+EXPERT_DROPOUT = 0.1  # Prob of forcing predictor (applied outside model)
+DISTILL_LOSS_WEIGHT = 0.1
+DISTILL_WARMUP_STEPS = 1000
+# Timestep buckets for precaching
+EXPERT_T_BUCKETS = torch.linspace(0.05, 0.95, 10)
+# ============================================================================
+# DATASET CONFIG
+# ============================================================================
+ENABLE_PORTRAIT = False
+ENABLE_SCHNELL = True
+ENABLE_SPORTFASHION = False
+ENABLE_SYNTHMOCAP = False
+PORTRAIT_REPO = "AbstractPhil/ffhq_flux_latents_repaired"
+PORTRAIT_NUM_SHARDS = 11
+SCHNELL_REPO = "AbstractPhil/flux-schnell-teacher-latents"
+SCHNELL_CONFIGS = ["train_512"]
+SPORTFASHION_REPO = "Pianokill/SportFashion_512x512"
+SYNTHMOCAP_REPO = "toyxyz/SynthMoCap_smpl_512"
+FG_LOSS_WEIGHT = 2.0
+BG_LOSS_WEIGHT = 0.5
+USE_MASKED_LOSS = False
+MIN_SNR_GAMMA = 5.0
+# Paths
+CHECKPOINT_DIR = "./tiny_flux_deep_checkpoints"
+LOG_DIR = "./tiny_flux_deep_logs"
+SAMPLE_DIR = "./tiny_flux_deep_samples"
+ENCODING_CACHE_DIR = "./encoding_cache"
+LATENT_CACHE_DIR = "./latent_cache"
+os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+os.makedirs(LOG_DIR, exist_ok=True)
+os.makedirs(SAMPLE_DIR, exist_ok=True)
+os.makedirs(ENCODING_CACHE_DIR, exist_ok=True)
+os.makedirs(LATENT_CACHE_DIR, exist_ok=True)
+# ============================================================================
+# REGULARIZATION CONFIG
+# ============================================================================
+TEXT_DROPOUT = 0.1
+GUIDANCE_DROPOUT = 0.1
+EMA_DECAY = 0.9999
+# ============================================================================
+# EXPERT FEATURE CACHE (precached, fast lookup + interpolation)
+# ============================================================================
+class ExpertFeatureCache:
+    """
+    Precached SD1.5-flow expert features with timestep interpolation.
+    Features extracted at 10 timestep buckets [0.05, 0.15, ..., 0.95].
+    At runtime, interpolates between nearest buckets.
+    """
+    def __init__(self, features: torch.Tensor, t_buckets: torch.Tensor, dtype=torch.float16):
+        self.features = features.to(dtype)  # [N, 10, 1280]
+        self.t_buckets = t_buckets
+        self.t_min = t_buckets[0].item()
+        self.t_max = t_buckets[-1].item()
+        self.t_step = (t_buckets[1] - t_buckets[0]).item()
+        self.n_buckets = len(t_buckets)
+        self.dtype = dtype
+    def get_features(self, indices: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        """
+        Get interpolated expert features.
+        Args:
+            indices: [B] sample indices into dataset
+            timesteps: [B] timesteps in [0, 1]
+        Returns:
+            [B, 1280] interpolated features
+        """
+        device = timesteps.device
+        # Clamp to valid range
+        t_clamped = timesteps.float().clamp(self.t_min, self.t_max)
+        # Find bucket indices
+        t_idx_float = (t_clamped - self.t_min) / self.t_step
+        t_idx_low = t_idx_float.long().clamp(0, self.n_buckets - 2)
+        t_idx_high = (t_idx_low + 1).clamp(0, self.n_buckets - 1)
+        # Interpolation alpha
+        alpha = (t_idx_float - t_idx_low.float()).unsqueeze(-1)  # [B, 1]
+        # Gather (on CPU for large caches)
+        idx_cpu = indices.cpu()
+        t_low_cpu = t_idx_low.cpu()
+        t_high_cpu = t_idx_high.cpu()
+        f_low = self.features[idx_cpu, t_low_cpu]   # [B, 1280]
+        f_high = self.features[idx_cpu, t_high_cpu]  # [B, 1280]
+        # Interpolate and move to device
+        result = (1 - alpha.cpu()) * f_low + alpha.cpu() * f_high
+        return result.to(device=device, dtype=self.dtype)
+def load_or_extract_expert_features(cache_path: str, prompts: List[str], name: str,
+                                     clip_tok, clip_enc, t_buckets: torch.Tensor,
+                                     batch_size: int = 32) -> Optional[ExpertFeatureCache]:
+    """
+    Load cached expert features or extract them from SD1.5-flow.
+    Follows same pattern as load_or_encode for text embeddings.
+    """
+    if not prompts or not ENABLE_EXPERT_DISTILLATION:
+        return None
+    # Check cache
+    if os.path.exists(cache_path):
+        print(f"Loading cached {name} expert features...")
+        cached = torch.load(cache_path, map_location="cpu")
+        cache = ExpertFeatureCache(cached["features"], cached["t_buckets"], DTYPE)
+        print(f"  ✓ Loaded {cache.features.shape[0]} samples × {cache.n_buckets} timesteps")
+        return cache
+    # Extract features
+    print(f"Extracting {name} expert features ({len(prompts)} × {len(t_buckets)} timesteps)...")
+    print(f"  This is a one-time operation, will be cached for future runs.")
+    # Load expert model temporarily
+    checkpoint_path = hf_hub_download(
+        repo_id=EXPERT_CHECKPOINT,
+        filename=EXPERT_CHECKPOINT_PATH,
+    )
+    from diffusers import UNet2DConditionModel
+    unet = UNet2DConditionModel.from_pretrained(
+        "stable-diffusion-v1-5/stable-diffusion-v1-5",
+        subfolder="unet",
+        torch_dtype=DTYPE,
+    ).to(DEVICE).eval()
+    state_dict = load_file(checkpoint_path)
+    unet.load_state_dict(state_dict, strict=False)
+    for p in unet.parameters():
+        p.requires_grad = False
+    # Hook for mid-block features
+    mid_features = [None]
+    def hook_fn(module, inp, out):
+        mid_features[0] = out.mean(dim=[2, 3])
+    unet.mid_block.register_forward_hook(hook_fn)
+    # Extract
+    n_prompts = len(prompts)
+    n_buckets = len(t_buckets)
+    all_features = torch.zeros(n_prompts, n_buckets, EXPERT_DIM, dtype=torch.float16)
+    with torch.no_grad():
+        for start_idx in tqdm(range(0, n_prompts, batch_size), desc=f"Extracting {name}"):
+            end_idx = min(start_idx + batch_size, n_prompts)
+            batch_prompts = prompts[start_idx:end_idx]
+            B = len(batch_prompts)
+            # Encode CLIP hidden states
+            clip_inputs = clip_tok(
+                batch_prompts, return_tensors="pt", padding="max_length",
+                max_length=77, truncation=True
+            ).to(DEVICE)
+            clip_hidden = clip_enc(**clip_inputs).last_hidden_state  # [B, 77, 768]
+            # Extract at each timestep bucket
+            for t_idx, t_val in enumerate(t_buckets):
+                timesteps = torch.full((B,), t_val.item(), device=DEVICE)
+                latents = torch.randn(B, 4, 64, 64, device=DEVICE, dtype=DTYPE)
+                _ = unet(latents, timesteps * 1000, encoder_hidden_states=clip_hidden.to(DTYPE))
+                all_features[start_idx:end_idx, t_idx] = mid_features[0].cpu().to(torch.float16)
+    # Cleanup
+    del unet
+    torch.cuda.empty_cache()
+    # Save cache
+    torch.save({"features": all_features, "t_buckets": t_buckets}, cache_path)
+    print(f"  ✓ Cached to {cache_path}")
+    print(f"  Size: {all_features.numel() * 2 / 1e9:.2f} GB")
+    return ExpertFeatureCache(all_features, t_buckets, DTYPE)
+# ============================================================================
+# EMA
+# ============================================================================
+class EMA:
+    def __init__(self, model, decay=0.9999):
+        self.decay = decay
+        self.shadow = {}
+        self._backup = {}
+        if hasattr(model, '_orig_mod'):
+            state = model._orig_mod.state_dict()
+        else:
+            state = model.state_dict()
+        for k, v in state.items():
+            self.shadow[k] = v.clone().detach()
+    @torch.no_grad()
+    def update(self, model):
+        if hasattr(model, '_orig_mod'):
+            state = model._orig_mod.state_dict()
+        else:
+            state = model.state_dict()
+        for k, v in state.items():
+            if k in self.shadow:
+                self.shadow[k].lerp_(v.to(self.shadow[k].dtype), 1 - self.decay)
+    def apply_shadow_for_eval(self, model):
+        if hasattr(model, '_orig_mod'):
+            self._backup = {k: v.clone() for k, v in model._orig_mod.state_dict().items()}
+            model._orig_mod.load_state_dict(self.shadow)
+        else:
+            self._backup = {k: v.clone() for k, v in model.state_dict().items()}
+            model.load_state_dict(self.shadow)
+    def restore(self, model):
+        if hasattr(model, '_orig_mod'):
+            model._orig_mod.load_state_dict(self._backup)
+        else:
+            model.load_state_dict(self._backup)
+        self._backup = {}
+    def state_dict(self):
+        return {'shadow': self.shadow, 'decay': self.decay}
+    def load_state_dict(self, state):
+        self.shadow = {k: v.clone() for k, v in state['shadow'].items()}
+        self.decay = state.get('decay', self.decay)
+    def load_shadow(self, shadow_state):
+        """Load EMA shadow weights, handling architecture changes gracefully."""
+        device = next(iter(self.shadow.values())).device if self.shadow else 'cuda'
+        loaded = 0
+        skipped_old = 0
+        kept_new = 0
+        for k, v in shadow_state.items():
+            if k in self.shadow:
+                # Key exists in current model - load it
+                self.shadow[k] = v.clone().to(device)
+                loaded += 1
+            else:
+                # Key doesn't exist (deprecated like guidance_in)
+                skipped_old += 1
+        # Count new keys not in checkpoint
+        for k in self.shadow:
+            if k not in shadow_state:
+                kept_new += 1
+        print(f"  ✓ Restored EMA: {loaded} loaded, {skipped_old} deprecated skipped, {kept_new} new (fresh init)")
+# ============================================================================
+# REGULARIZATION
+# ============================================================================
+def apply_text_dropout(t5_embeds, clip_pooled, dropout_prob=0.1):
+    B = t5_embeds.shape[0]
+    mask = torch.rand(B, device=t5_embeds.device) < dropout_prob
+    t5_embeds = t5_embeds.clone()
+    clip_pooled = clip_pooled.clone()
+    t5_embeds[mask] = 0
+    clip_pooled[mask] = 0
+    return t5_embeds, clip_pooled, mask
+# ============================================================================
+# MASKING UTILITIES
+# ============================================================================
+def detect_background_color(image: Image.Image, sample_size: int = 100) -> Tuple[int, int, int]:
+    img = np.array(image)
+    if len(img.shape) == 2:
+        img = np.stack([img] * 3, axis=-1)
+    h, w = img.shape[:2]
+    corners = [
+        img[:sample_size, :sample_size],
+        img[:sample_size, -sample_size:],
+        img[-sample_size:, :sample_size],
+        img[-sample_size:, -sample_size:],
+    ]
+    corner_pixels = np.concatenate([c.reshape(-1, 3) for c in corners], axis=0)
+    bg_color = np.median(corner_pixels, axis=0).astype(np.uint8)
+    return tuple(bg_color)
+def create_product_mask(image: Image.Image, threshold: int = 30) -> np.ndarray:
+    img = np.array(image).astype(np.float32)
+    if len(img.shape) == 2:
+        img = np.stack([img] * 3, axis=-1)
+    bg_color = detect_background_color(image)
+    bg_color = np.array(bg_color, dtype=np.float32)
+    diff = np.sqrt(np.sum((img - bg_color) ** 2, axis=-1))
+    mask = (diff > threshold).astype(np.float32)
+    return mask
+def create_smpl_mask(conditioning_image: Image.Image, threshold: int = 20) -> np.ndarray:
+    img = np.array(conditioning_image).astype(np.float32)
+    if len(img.shape) == 2:
+        return (img > threshold).astype(np.float32)
+    r, g, b = img[:, :, 0], img[:, :, 1], img[:, :, 2]
+    is_background = (g > r + 20) & (g > b + 20)
+    mask = (~is_background).astype(np.float32)
+    return mask
+def downsample_mask_to_latent(mask: np.ndarray, latent_h: int = 64, latent_w: int = 64) -> torch.Tensor:
+    mask_pil = Image.fromarray((mask * 255).astype(np.uint8))
+    mask_pil = mask_pil.resize((latent_w, latent_h), Image.Resampling.BILINEAR)
+    mask_latent = np.array(mask_pil).astype(np.float32) / 255.0
+    return torch.from_numpy(mask_latent)
+# ============================================================================
+# HF HUB SETUP
+# ============================================================================
+print("Setting up HuggingFace Hub...")
+api = HfApi()
+# ============================================================================
+# FLOW MATCHING HELPERS
+# ============================================================================
+def flux_shift(t, s=SHIFT):
+    return s * t / (1 + (s - 1) * t)
+def min_snr_weight(t, gamma=MIN_SNR_GAMMA):
+    snr = (t / (1 - t).clamp(min=1e-5)).pow(2)
+    return torch.clamp(snr, max=gamma) / snr.clamp(min=1e-5)
+# ============================================================================
+# LOAD TEXT ENCODERS
+# ============================================================================
+print("Loading text encoders...")
+t5_tok = T5Tokenizer.from_pretrained("google/flan-t5-base")
+t5_enc = T5EncoderModel.from_pretrained("google/flan-t5-base", torch_dtype=DTYPE).to(DEVICE).eval()
+for p in t5_enc.parameters():
+    p.requires_grad = False
+clip_tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+clip_enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=DTYPE).to(DEVICE).eval()
+for p in clip_enc.parameters():
+    p.requires_grad = False
+print("✓ Text encoders loaded")
+# ============================================================================
+# LOAD VAE
+# ============================================================================
+print("Loading VAE...")
+from diffusers import AutoencoderKL
+vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=DTYPE).to(DEVICE).eval()
+for p in vae.parameters():
+    p.requires_grad = False
+VAE_SCALE = vae.config.scaling_factor
+print(f"✓ VAE loaded (scale={VAE_SCALE})")
+# ============================================================================
+# ENCODING FUNCTIONS
+# ============================================================================
+@torch.no_grad()
+def encode_prompt(prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
+    t5_inputs = t5_tok(prompt, return_tensors="pt", padding="max_length",
+                       max_length=MAX_SEQ, truncation=True).to(DEVICE)
+    t5_out = t5_enc(**t5_inputs).last_hidden_state
+    clip_inputs = clip_tok(prompt, return_tensors="pt", padding="max_length",
+                           max_length=77, truncation=True).to(DEVICE)
+    clip_out = clip_enc(**clip_inputs).pooler_output
+    return t5_out.squeeze(0), clip_out.squeeze(0)
+@torch.no_grad()
+def encode_prompts_batched(prompts: List[str], batch_size: int = 64) -> Tuple[torch.Tensor, torch.Tensor]:
+    all_t5 = []
+    all_clip = []
+    for i in tqdm(range(0, len(prompts), batch_size), desc="Encoding", leave=False):
+        batch = prompts[i:i+batch_size]
+        t5_inputs = t5_tok(batch, return_tensors="pt", padding="max_length",
+                          max_length=MAX_SEQ, truncation=True).to(DEVICE)
+        t5_out = t5_enc(**t5_inputs).last_hidden_state
+        all_t5.append(t5_out.cpu())
+        clip_inputs = clip_tok(batch, return_tensors="pt", padding="max_length",
+                              max_length=77, truncation=True).to(DEVICE)
+        clip_out = clip_enc(**clip_inputs).pooler_output
+        all_clip.append(clip_out.cpu())
+    return torch.cat(all_t5, dim=0), torch.cat(all_clip, dim=0)
+@torch.no_grad()
+def encode_image_to_latent(image: Image.Image) -> torch.Tensor:
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    if image.size != (512, 512):
+        image = image.resize((512, 512), Image.Resampling.LANCZOS)
+    img_tensor = torch.from_numpy(np.array(image)).float() / 255.0
+    img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0)
+    img_tensor = (img_tensor * 2.0 - 1.0).to(DEVICE, dtype=DTYPE)
+    latent = vae.encode(img_tensor).latent_dist.sample()
+    latent = latent * VAE_SCALE
+    return latent.squeeze(0).cpu()
+# ============================================================================
+# LOAD DATASETS
+# ============================================================================
+portrait_ds = None
+portrait_indices = []
+portrait_prompts = []
+if ENABLE_PORTRAIT:
+    print(f"\n[1/4] Loading portrait dataset from {PORTRAIT_REPO}...")
+    portrait_shards = []
+    for i in range(PORTRAIT_NUM_SHARDS):
+        split_name = f"train_{i:02d}"
+        print(f"  Loading {split_name}...")
+        shard = load_dataset(PORTRAIT_REPO, split=split_name)
+        portrait_shards.append(shard)
+    portrait_ds = concatenate_datasets(portrait_shards)
+    print(f"✓ Portrait: {len(portrait_ds)} base samples")
+    print("  Extracting prompts (columnar)...")
+    florence_list = list(portrait_ds["text_florence"])
+    llava_list = list(portrait_ds["text_llava"])
+    blip_list = list(portrait_ds["text_blip"])
+    for i, (f, l, b) in enumerate(zip(florence_list, llava_list, blip_list)):
+        if f and f.strip():
+            portrait_indices.append(i)
+            portrait_prompts.append(f)
+        if l and l.strip():
+            portrait_indices.append(i)
+            portrait_prompts.append(l)
+        if b and b.strip():
+            portrait_indices.append(i)
+            portrait_prompts.append(b)
+    print(f"  Expanded: {len(portrait_prompts)} samples (3 prompts/image)")
+else:
+    print("\n[1/4] Portrait dataset DISABLED")
+schnell_ds = None
+schnell_prompts = []
+if ENABLE_SCHNELL:
+    print(f"\n[2/4] Loading schnell teacher dataset from {SCHNELL_REPO}...")
+    schnell_datasets = []
+    for config in SCHNELL_CONFIGS:
+        print(f"  Loading {config}...")
+        ds = load_dataset(SCHNELL_REPO, config, split="train")
+        schnell_datasets.append(ds)
+        print(f"    {len(ds)} samples")
+    schnell_ds = concatenate_datasets(schnell_datasets)
+    schnell_prompts = list(schnell_ds["prompt"])
+    print(f"✓ Schnell: {len(schnell_ds)} samples")
+else:
+    print("\n[2/4] Schnell dataset DISABLED")
+sportfashion_ds = None
+sportfashion_prompts = []
+if ENABLE_SPORTFASHION:
+    print(f"\n[3/4] Loading SportFashion dataset from {SPORTFASHION_REPO}...")
+    sportfashion_ds = load_dataset(SPORTFASHION_REPO, split="train")
+    sportfashion_prompts = list(sportfashion_ds["text"])
+    print(f"✓ SportFashion: {len(sportfashion_ds)} samples")
+else:
+    print("\n[3/4] SportFashion dataset DISABLED")
+synthmocap_ds = None
+synthmocap_prompts = []
+if ENABLE_SYNTHMOCAP:
+    print(f"\n[4/4] Loading SynthMoCap dataset from {SYNTHMOCAP_REPO}...")
+    synthmocap_ds = load_dataset(SYNTHMOCAP_REPO, split="train")
+    synthmocap_prompts = list(synthmocap_ds["text"])
+    print(f"✓ SynthMoCap: {len(synthmocap_ds)} samples")
+else:
+    print("\n[4/4] SynthMoCap dataset DISABLED")
+# ============================================================================
+# ENCODE ALL PROMPTS
+# ============================================================================
+total_samples = len(portrait_prompts) + len(schnell_prompts) + len(sportfashion_prompts) + len(synthmocap_prompts)
+print(f"\nTotal combined samples: {total_samples}")
+def load_or_encode(cache_path, prompts, name):
+    if not prompts:
+        return None, None
+    if os.path.exists(cache_path):
+        print(f"Loading cached {name} encodings...")
+        cached = torch.load(cache_path)
+        return cached["t5_embeds"], cached["clip_pooled"]
+    else:
+        print(f"Encoding {len(prompts)} {name} prompts...")
+        t5, clip = encode_prompts_batched(prompts, batch_size=64)
+        torch.save({"t5_embeds": t5, "clip_pooled": clip}, cache_path)
+        print(f"✓ Cached to {cache_path}")
+        return t5, clip
+# Standard text encodings
+portrait_t5, portrait_clip = None, None
+schnell_t5, schnell_clip = None, None
+sportfashion_t5, sportfashion_clip = None, None
+synthmocap_t5, synthmocap_clip = None, None
+if portrait_prompts:
+    portrait_enc_cache = os.path.join(ENCODING_CACHE_DIR, f"portrait_encodings_{len(portrait_prompts)}.pt")
+    portrait_t5, portrait_clip = load_or_encode(portrait_enc_cache, portrait_prompts, "portrait")
+if schnell_prompts:
+    schnell_enc_cache = os.path.join(ENCODING_CACHE_DIR, f"schnell_encodings_{len(schnell_prompts)}.pt")
+    schnell_t5, schnell_clip = load_or_encode(schnell_enc_cache, schnell_prompts, "schnell")
+if sportfashion_prompts:
+    sportfashion_enc_cache = os.path.join(ENCODING_CACHE_DIR, f"sportfashion_encodings_{len(sportfashion_prompts)}.pt")
+    sportfashion_t5, sportfashion_clip = load_or_encode(sportfashion_enc_cache, sportfashion_prompts, "sportfashion")
+if synthmocap_prompts:
+    synthmocap_enc_cache = os.path.join(ENCODING_CACHE_DIR, f"synthmocap_encodings_{len(synthmocap_prompts)}.pt")
+    synthmocap_t5, synthmocap_clip = load_or_encode(synthmocap_enc_cache, synthmocap_prompts, "synthmocap")
+# ============================================================================
+# EXTRACT/LOAD EXPERT FEATURES (precached)
+# ============================================================================
+print("\n" + "="*60)
+print("Expert Feature Caching")
+print("="*60)
+schnell_expert_cache = None
+portrait_expert_cache = None
+sportfashion_expert_cache = None
+synthmocap_expert_cache = None
+if schnell_prompts and ENABLE_EXPERT_DISTILLATION:
+    schnell_expert_path = os.path.join(ENCODING_CACHE_DIR, f"schnell_expert_{len(schnell_prompts)}.pt")
+    schnell_expert_cache = load_or_extract_expert_features(
+        schnell_expert_path, schnell_prompts, "schnell",
+        clip_tok, clip_enc, EXPERT_T_BUCKETS
+    )
+if portrait_prompts and ENABLE_EXPERT_DISTILLATION:
+    portrait_expert_path = os.path.join(ENCODING_CACHE_DIR, f"portrait_expert_{len(portrait_prompts)}.pt")
+    portrait_expert_cache = load_or_extract_expert_features(
+        portrait_expert_path, portrait_prompts, "portrait",
+        clip_tok, clip_enc, EXPERT_T_BUCKETS
+    )
+if sportfashion_prompts and ENABLE_EXPERT_DISTILLATION:
+    sportfashion_expert_path = os.path.join(ENCODING_CACHE_DIR, f"sportfashion_expert_{len(sportfashion_prompts)}.pt")
+    sportfashion_expert_cache = load_or_extract_expert_features(
+        sportfashion_expert_path, sportfashion_prompts, "sportfashion",
+        clip_tok, clip_enc, EXPERT_T_BUCKETS
+    )
+if synthmocap_prompts and ENABLE_EXPERT_DISTILLATION:
+    synthmocap_expert_path = os.path.join(ENCODING_CACHE_DIR, f"synthmocap_expert_{len(synthmocap_prompts)}.pt")
+    synthmocap_expert_cache = load_or_extract_expert_features(
+        synthmocap_expert_path, synthmocap_prompts, "synthmocap",
+        clip_tok, clip_enc, EXPERT_T_BUCKETS
+    )
+# ============================================================================
+# COMBINED DATASET CLASS (with sample_idx for expert lookup)
+# ============================================================================
+class CombinedDataset(Dataset):
+    """Combined dataset returning sample index for expert feature lookup."""
+    def __init__(
+        self,
+        portrait_ds, portrait_indices, portrait_t5, portrait_clip,
+        schnell_ds, schnell_t5, schnell_clip,
+        sportfashion_ds, sportfashion_t5, sportfashion_clip,
+        synthmocap_ds, synthmocap_t5, synthmocap_clip,
+        vae, vae_scale, device, dtype,
+        compute_masks=True,
+    ):
+        self.portrait_ds = portrait_ds
+        self.portrait_indices = portrait_indices
+        self.portrait_t5 = portrait_t5
+        self.portrait_clip = portrait_clip
+        self.schnell_ds = schnell_ds
+        self.schnell_t5 = schnell_t5
+        self.schnell_clip = schnell_clip
+        self.sportfashion_ds = sportfashion_ds
+        self.sportfashion_t5 = sportfashion_t5
+        self.sportfashion_clip = sportfashion_clip
+        self.synthmocap_ds = synthmocap_ds
+        self.synthmocap_t5 = synthmocap_t5
+        self.synthmocap_clip = synthmocap_clip
+        self.vae = vae
+        self.vae_scale = vae_scale
+        self.device = device
+        self.dtype = dtype
+        self.compute_masks = compute_masks
+        self.n_portrait = len(portrait_indices) if portrait_indices else 0
+        self.n_schnell = len(schnell_ds) if schnell_ds else 0
+        self.n_sportfashion = len(sportfashion_ds) if sportfashion_ds else 0
+        self.n_synthmocap = len(synthmocap_ds) if synthmocap_ds else 0
+        self.c1 = self.n_portrait
+        self.c2 = self.c1 + self.n_schnell
+        self.c3 = self.c2 + self.n_sportfashion
+        self.total = self.c3 + self.n_synthmocap
+    def __len__(self):
+        return self.total
+    def _get_latent_from_array(self, latent_data):
+        if isinstance(latent_data, torch.Tensor):
+            return latent_data.to(self.dtype)
+        return torch.tensor(np.array(latent_data), dtype=self.dtype)
+    @torch.no_grad()
+    def _encode_image(self, image):
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        if image.size != (512, 512):
+            image = image.resize((512, 512), Image.Resampling.LANCZOS)
+        img_tensor = torch.from_numpy(np.array(image)).float() / 255.0
+        img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0)
+        img_tensor = (img_tensor * 2.0 - 1.0).to(self.device, dtype=self.dtype)
+        latent = self.vae.encode(img_tensor).latent_dist.sample()
+        latent = latent * self.vae_scale
+        return latent.squeeze(0).cpu()
+    def __getitem__(self, idx):
+        mask = None
+        # Determine which dataset and local index
+        if idx < self.c1:
+            # Portrait
+            local_idx = idx
+            orig_idx = self.portrait_indices[idx]
+            item = self.portrait_ds[orig_idx]
+            latent = self._get_latent_from_array(item["latent"])
+            t5 = self.portrait_t5[idx]
+            clip = self.portrait_clip[idx]
+            dataset_id = 0
+        elif idx < self.c2:
+            # Schnell
+            local_idx = idx - self.c1
+            item = self.schnell_ds[local_idx]
+            latent = self._get_latent_from_array(item["latent"])
+            t5 = self.schnell_t5[local_idx]
+            clip = self.schnell_clip[local_idx]
+            dataset_id = 1
+        elif idx < self.c3:
+            # SportFashion
+            local_idx = idx - self.c2
+            item = self.sportfashion_ds[local_idx]
+            image = item["image"]
+            latent = self._encode_image(image)
+            t5 = self.sportfashion_t5[local_idx]
+            clip = self.sportfashion_clip[local_idx]
+            dataset_id = 2
+            if self.compute_masks:
+                pixel_mask = create_product_mask(image)
+                mask = downsample_mask_to_latent(pixel_mask, 64, 64)
+        else:
+            # SynthMoCap
+            local_idx = idx - self.c3
+            item = self.synthmocap_ds[local_idx]
+            image = item["image"]
+            conditioning = item["conditioning_image"]
+            latent = self._encode_image(image)
+            t5 = self.synthmocap_t5[local_idx]
+            clip = self.synthmocap_clip[local_idx]
+            dataset_id = 3
+            if self.compute_masks:
+                pixel_mask = create_smpl_mask(conditioning)
+                mask = downsample_mask_to_latent(pixel_mask, 64, 64)
+        result = {
+            "latent": latent,
+            "t5_embed": t5.to(self.dtype),
+            "clip_pooled": clip.to(self.dtype),
+            "sample_idx": idx,  # Global index for expert cache lookup
+            "local_idx": local_idx,  # Local index within dataset
+            "dataset_id": dataset_id,  # Which dataset (0=portrait, 1=schnell, etc)
+        }
+        if mask is not None:
+            result["mask"] = mask.to(self.dtype)
+        return result
+# ============================================================================
+# COLLATE FUNCTION
+# ============================================================================
+def collate_fn(batch):
+    latents = torch.stack([b["latent"] for b in batch])
+    t5_embeds = torch.stack([b["t5_embed"] for b in batch])
+    clip_pooled = torch.stack([b["clip_pooled"] for b in batch])
+    sample_indices = torch.tensor([b["sample_idx"] for b in batch], dtype=torch.long)
+    local_indices = torch.tensor([b["local_idx"] for b in batch], dtype=torch.long)
+    dataset_ids = torch.tensor([b["dataset_id"] for b in batch], dtype=torch.long)
+    masks = None
+    if any("mask" in b for b in batch):
+        masks = []
+        for b in batch:
+            if "mask" in b:
+                masks.append(b["mask"])
+            else:
+                masks.append(torch.ones(64, 64, dtype=latents.dtype))
+        masks = torch.stack(masks)
+    return {
+        "latents": latents,
+        "t5_embeds": t5_embeds,
+        "clip_pooled": clip_pooled,
+        "sample_indices": sample_indices,
+        "local_indices": local_indices,
+        "dataset_ids": dataset_ids,
+        "masks": masks,
+    }
+# ============================================================================
+# EXPERT FEATURE LOOKUP (handles multiple datasets)
+# ============================================================================
+def get_expert_features_for_batch(
+    local_indices: torch.Tensor,
+    dataset_ids: torch.Tensor,
+    timesteps: torch.Tensor,
+    portrait_cache: Optional[ExpertFeatureCache],
+    schnell_cache: Optional[ExpertFeatureCache],
+    sportfashion_cache: Optional[ExpertFeatureCache],
+    synthmocap_cache: Optional[ExpertFeatureCache],
+) -> Optional[torch.Tensor]:
+    """Get expert features from the appropriate cache for each sample."""
+    caches = [portrait_cache, schnell_cache, sportfashion_cache, synthmocap_cache]
+    # Check if any cache is available
+    if not any(c is not None for c in caches):
+        return None
+    B = local_indices.shape[0]
+    device = timesteps.device
+    features = torch.zeros(B, EXPERT_DIM, device=device, dtype=DTYPE)
+    for ds_id, cache in enumerate(caches):
+        if cache is None:
+            continue
+        # Find samples from this dataset
+        mask = dataset_ids == ds_id
+        if not mask.any():
+            continue
+        # Get features for these samples
+        ds_local_indices = local_indices[mask]
+        ds_timesteps = timesteps[mask]
+        ds_features = cache.get_features(ds_local_indices, ds_timesteps)
+        features[mask] = ds_features
+    return features
+# ============================================================================
+# MASKED LOSS FUNCTION
+# ============================================================================
+def masked_mse_loss(pred, target, mask=None, fg_weight=2.0, bg_weight=0.5, snr_weights=None):
+    B, N, C = pred.shape
+    if mask is None:
+        loss_per_sample = ((pred - target) ** 2).mean(dim=[1, 2])
+    else:
+        H = W = int(math.sqrt(N))
+        mask_flat = mask.view(B, H * W, 1).to(pred.device)
+        sq_error = (pred - target) ** 2
+        weights = mask_flat * fg_weight + (1 - mask_flat) * bg_weight
+        weighted_error = sq_error * weights
+        loss_per_sample = weighted_error.mean(dim=[1, 2])
+    if snr_weights is not None:
+        loss_per_sample = loss_per_sample * snr_weights
+    return loss_per_sample.mean()
+# ============================================================================
+# CREATE DATASET
+# ============================================================================
+print("\nCreating combined dataset...")
+combined_ds = CombinedDataset(
+    portrait_ds, portrait_indices, portrait_t5, portrait_clip,
+    schnell_ds, schnell_t5, schnell_clip,
+    sportfashion_ds, sportfashion_t5, sportfashion_clip,
+    synthmocap_ds, synthmocap_t5, synthmocap_clip,
+    vae, VAE_SCALE, DEVICE, DTYPE,
+    compute_masks=USE_MASKED_LOSS,
+)
+print(f"✓ Combined dataset: {len(combined_ds)} samples")
+print(f"  - Portraits (3x):    {combined_ds.n_portrait:,}")
+print(f"  - Schnell teacher:   {combined_ds.n_schnell:,}")
+print(f"  - SportFashion:      {combined_ds.n_sportfashion:,}")
+print(f"  - SynthMoCap:        {combined_ds.n_synthmocap:,}")
+print(f"  - Expert distillation: {ENABLE_EXPERT_DISTILLATION}")
+# ============================================================================
+# DATALOADER
+# ============================================================================
+loader = DataLoader(
+    combined_ds,
+    batch_size=BATCH_SIZE,
+    shuffle=True,
+    num_workers=8,
+    pin_memory=True,
+    collate_fn=collate_fn,
+    drop_last=True,
+)
+print(f"✓ DataLoader: {len(loader)} batches/epoch")
+# ============================================================================
+# SAMPLING FUNCTION
+# ============================================================================
+@torch.inference_mode()
+def generate_samples(model, prompts, num_steps=28, guidance_scale=3.5, H=64, W=64, use_ema=True):
+    was_training = model.training
+    model.eval()
+    if use_ema and 'ema' in globals() and ema is not None:
+        ema.apply_shadow_for_eval(model)
+    B = len(prompts)
+    C = 16
+    t5_list, clip_list = [], []
+    for p in prompts:
+        t5, clip = encode_prompt(p)
+        t5_list.append(t5)
+        clip_list.append(clip)
+    t5_embeds = torch.stack(t5_list).to(DTYPE)
+    clip_pooleds = torch.stack(clip_list).to(DTYPE)
+    x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)
+    img_ids = TinyFluxDeep.create_img_ids(B, H, W, DEVICE)
+    t_linear = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
+    timesteps = flux_shift(t_linear, s=SHIFT)
+    for i in range(num_steps):
+        t_curr = timesteps[i]
+        t_next = timesteps[i + 1]
+        dt = t_next - t_curr
+        t_batch = t_curr.expand(B).to(DTYPE)
+        with torch.autocast("cuda", dtype=DTYPE):
+            # No expert_features at inference - predictor runs standalone
+            v_cond = model(
+                hidden_states=x,
+                encoder_hidden_states=t5_embeds,
+                pooled_projections=clip_pooleds,
+                timestep=t_batch,
+                img_ids=img_ids,
+            )
+        x = x + v_cond * dt
+    latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
+    latents = latents / VAE_SCALE
+    with torch.autocast("cuda", dtype=DTYPE):
+        images = vae.decode(latents.to(vae.dtype)).sample
+    images = (images / 2 + 0.5).clamp(0, 1)
+    if use_ema and 'ema' in globals() and ema is not None:
+        ema.restore(model)
+    if was_training:
+        model.train()
+    return images
+def save_samples(images, prompts, step, output_dir):
+    from torchvision.utils import save_image
+    os.makedirs(output_dir, exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    grid_path = os.path.join(output_dir, f"samples_step_{step}.png")
+    save_image(images, grid_path, nrow=2, padding=2)
+    try:
+        api.upload_file(
+            path_or_fileobj=grid_path,
+            path_in_repo=f"samples/{timestamp}_step_{step}.png",
+            repo_id=HF_REPO,
+        )
+    except:
+        pass
+# ============================================================================
+# CHECKPOINT FUNCTIONS
+# ============================================================================
+def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path, ema=None):
+    os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
+    if hasattr(model, '_orig_mod'):
+        state_dict = model._orig_mod.state_dict()
+    else:
+        state_dict = model.state_dict()
+    state_dict = {k: v.to(DTYPE) if v.is_floating_point() else v for k, v in state_dict.items()}
+    weights_path = path.replace(".pt", ".safetensors")
+    save_file(state_dict, weights_path)
+    if ema is not None:
+        ema_weights = {k: v.to(DTYPE) if v.is_floating_point() else v for k, v in ema.shadow.items()}
+        ema_weights_path = path.replace(".pt", "_ema.safetensors")
+        save_file(ema_weights, ema_weights_path)
+    state = {
+        "step": step,
+        "epoch": epoch,
+        "loss": loss,
+        "optimizer": optimizer.state_dict(),
+        "scheduler": scheduler.state_dict(),
+    }
+    if ema is not None:
+        state["ema_decay"] = ema.decay
+    torch.save(state, path)
+    print(f"  ✓ Saved checkpoint: step {step}")
+    return weights_path
+def upload_checkpoint(weights_path, step):
+    try:
+        api.upload_file(
+            path_or_fileobj=weights_path,
+            path_in_repo=f"checkpoints/step_{step}.safetensors",
+            repo_id=HF_REPO,
+        )
+        ema_path = weights_path.replace(".safetensors", "_ema.safetensors")
+        if os.path.exists(ema_path):
+            api.upload_file(
+                path_or_fileobj=ema_path,
+                path_in_repo=f"checkpoints/step_{step}_ema.safetensors",
+                repo_id=HF_REPO,
+            )
+        print(f"  ✓ Uploaded checkpoint to {HF_REPO}")
+    except Exception as e:
+        print(f"  ⚠ Upload failed: {e}")
+def load_with_weight_upgrade(model, state_dict):
+    """
+    Load state dict with automatic handling of:
+      - Missing ExpertPredictor weights → initialize fresh
+      - Missing Q/K norm weights → initialize to ones (identity)
+      - Unexpected keys → ignore (e.g., old guidance_in, sin_basis caches)
+    """
+    model_state = model.state_dict()
+    # Patterns for new weights that may not exist in old checkpoints
+    NEW_WEIGHT_PATTERNS = [
+        'expert_predictor.',  # New ExpertPredictor module
+        '.norm_q.weight',
+        '.norm_k.weight',
+        '.norm_added_q.weight',
+        '.norm_added_k.weight',
+    ]
+    # Keys that may exist in old checkpoints but not new model
+    DEPRECATED_PATTERNS = [
+        'guidance_in.',  # Replaced by expert_predictor
+        '.sin_basis',    # Old cached sin embeddings
+    ]
+    loaded_keys = []
+    missing_keys = []
+    unexpected_keys = []
+    initialized_keys = []
+    # First pass: load matching weights
+    for key in state_dict.keys():
+        if key in model_state:
+            if state_dict[key].shape == model_state[key].shape:
+                model_state[key] = state_dict[key]
+                loaded_keys.append(key)
+            else:
+                print(f"  ⚠ Shape mismatch for {key}: checkpoint {state_dict[key].shape} vs model {model_state[key].shape}")
+                unexpected_keys.append(key)
+        else:
+            is_deprecated = any(pat in key for pat in DEPRECATED_PATTERNS)
+            if is_deprecated:
+                unexpected_keys.append(key)
+            else:
+                print(f"  ⚠ Unexpected key (not in model): {key}")
+                unexpected_keys.append(key)
+    # Second pass: handle missing keys
+    for key in model_state.keys():
+        if key not in loaded_keys:
+            is_new = any(pat in key for pat in NEW_WEIGHT_PATTERNS)
+            if is_new:
+                # Keep default initialization for new modules
+                initialized_keys.append(key)
+            else:
+                missing_keys.append(key)
+                print(f"  ⚠ Missing key (not in checkpoint): {key}")
+    # Load the updated state
+    model.load_state_dict(model_state, strict=False)
+    # Report
+    if initialized_keys:
+        # Group by module for cleaner output
+        modules = set()
+        for k in initialized_keys:
+            parts = k.split('.')
+            if len(parts) >= 2:
+                modules.add(parts[0] + '.' + parts[1] if parts[0] == 'expert_predictor' else parts[0])
+        print(f"  ✓ Initialized new modules (fresh): {sorted(modules)}")
+    if unexpected_keys:
+        deprecated = [k for k in unexpected_keys if any(p in k for p in DEPRECATED_PATTERNS)]
+        if deprecated:
+            print(f"  ✓ Ignored deprecated keys: {len(deprecated)} (guidance_in, etc)")
+    return missing_keys, unexpected_keys
+def load_checkpoint(model, optimizer, scheduler, target):
+    """
+    Load checkpoint with weight upgrade support for ExpertPredictor.
+    When ALLOW_WEIGHT_UPGRADE=True:
+      - Missing ExpertPredictor weights are initialized fresh
+      - Old guidance_in weights are ignored
+      - Model continues training with new architecture
+    """
+    start_step = 0
+    start_epoch = 0
+    ema_state = None
+    if target == "none":
+        print("Starting fresh (no checkpoint)")
+        return start_step, start_epoch, None
+    ckpt_path = None
+    weights_path = None
+    ema_weights_path = None
+    if target == "latest":
+        if os.path.exists(CHECKPOINT_DIR):
+            ckpts = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith("step_") and f.endswith(".pt")]
+            if ckpts:
+                steps = [int(f.split("_")[1].split(".")[0]) for f in ckpts]
+                latest_step = max(steps)
+                ckpt_path = os.path.join(CHECKPOINT_DIR, f"step_{latest_step}.pt")
+                weights_path = ckpt_path.replace(".pt", ".safetensors")
+                ema_weights_path = ckpt_path.replace(".pt", "_ema.safetensors")
+    elif target == "hub" or target.startswith("hub:"):
+        try:
+            from huggingface_hub import list_repo_files
+            if target.startswith("hub:"):
+                step_name = target.split(":")[1]
+                weights_path = hf_hub_download(HF_REPO, f"checkpoints/{step_name}.safetensors")
+                try:
+                    ema_weights_path = hf_hub_download(HF_REPO, f"checkpoints/{step_name}_ema.safetensors")
+                    print(f"  Found EMA weights on hub")
+                except:
+                    ema_weights_path = None
+                    print(f"  No EMA weights on hub (will start fresh)")
+                start_step = int(step_name.split("_")[1]) if "_" in step_name else 0
+                print(f"Downloaded {step_name} from hub")
+            else:
+                files = list_repo_files(HF_REPO)
+                ckpts = [f for f in files if f.startswith("checkpoints/step_") and f.endswith(".safetensors") and "_ema" not in f]
+                if ckpts:
+                    steps = [int(f.split("_")[1].split(".")[0]) for f in ckpts]
+                    latest = max(steps)
+                    weights_path = hf_hub_download(HF_REPO, f"checkpoints/step_{latest}.safetensors")
+                    try:
+                        ema_weights_path = hf_hub_download(HF_REPO, f"checkpoints/step_{latest}_ema.safetensors")
+                        print(f"  Found EMA weights on hub")
+                    except:
+                        ema_weights_path = None
+                        print(f"  No EMA weights on hub (will start fresh)")
+                    start_step = latest
+                    print(f"Downloaded step_{latest} from hub")
+        except Exception as e:
+            print(f"Could not download from hub: {e}")
+            return start_step, start_epoch, None
+    elif target == "best":
+        ckpt_path = os.path.join(CHECKPOINT_DIR, "best.pt")
+        weights_path = ckpt_path.replace(".pt", ".safetensors")
+        ema_weights_path = ckpt_path.replace(".pt", "_ema.safetensors")
+    elif os.path.exists(target):
+        if target.endswith(".safetensors"):
+            weights_path = target
+            ckpt_path = target.replace(".safetensors", ".pt")
+            ema_weights_path = target.replace(".safetensors", "_ema.safetensors")
+        else:
+            ckpt_path = target
+            weights_path = target.replace(".pt", ".safetensors")
+            ema_weights_path = target.replace(".pt", "_ema.safetensors")
+    # Load main model weights
+    if weights_path and os.path.exists(weights_path):
+        print(f"Loading weights from {weights_path}")
+        state_dict = load_file(weights_path)
+        state_dict = {k: v.to(DTYPE) if v.is_floating_point() else v for k, v in state_dict.items()}
+        # Get model reference (handle torch.compile wrapper)
+        model_ref = model._orig_mod if hasattr(model, '_orig_mod') else model
+        if ALLOW_WEIGHT_UPGRADE:
+            # Flexible loading with weight upgrade
+            missing, unexpected = load_with_weight_upgrade(model_ref, state_dict)
+            if missing:
+                print(f"  ⚠ {len(missing)} truly missing parameters (may need attention)")
+        else:
+            # Strict loading - must match exactly
+            model_ref.load_state_dict(state_dict, strict=True)
+        print(f"✓ Loaded model weights")
+        # Load EMA weights if they exist
+        if ema_weights_path and os.path.exists(ema_weights_path):
+            ema_state = load_file(ema_weights_path)
+            ema_state = {k: v.to(DTYPE) if v.is_floating_point() else v for k, v in ema_state.items()}
+            print(f"✓ Loaded EMA weights ({len(ema_state)} params)")
+        else:
+            print(f"  ℹ No EMA weights found (will initialize fresh)")
+    else:
+        print(f"  ⚠ Weights file not found: {weights_path}")
+        print(f"  Starting with fresh model")
+        return start_step, start_epoch, None
+    # Load optimizer/scheduler state
+    if ckpt_path and os.path.exists(ckpt_path):
+        state = torch.load(ckpt_path, map_location="cpu")
+        start_step = state.get("step", 0)
+        start_epoch = state.get("epoch", 0)
+        try:
+            optimizer.load_state_dict(state["optimizer"])
+            scheduler.load_state_dict(state["scheduler"])
+            print(f"✓ Loaded optimizer/scheduler state")
+        except Exception as e:
+            print(f"  ⚠ Could not load optimizer state: {e}")
+            print(f"  Will use fresh optimizer (this is fine for architecture changes)")
+        print(f"Resuming from step {start_step}, epoch {start_epoch}")
+    return start_step, start_epoch, ema_state
+# ============================================================================
+# CREATE MODEL
+# ============================================================================
+print("\nCreating TinyFluxDeep model with ExpertPredictor...")
+config = TinyFluxDeepConfig(
+    use_expert_predictor=ENABLE_EXPERT_DISTILLATION,
+    expert_dim=EXPERT_DIM,
+    expert_hidden_dim=EXPERT_HIDDEN_DIM,
+    expert_dropout=EXPERT_DROPOUT,
+    guidance_embeds=False,
+)
+model = TinyFluxDeep(config).to(device=DEVICE, dtype=DTYPE)
+total_params = sum(p.numel() for p in model.parameters())
+print(f"Total parameters: {total_params:,}")
+if hasattr(model, 'expert_predictor') and model.expert_predictor is not None:
+    expert_params = sum(p.numel() for p in model.expert_predictor.parameters())
+    print(f"Expert predictor parameters: {expert_params:,}")
+trainable_params = [p for p in model.parameters() if p.requires_grad]
+print(f"Trainable parameters: {sum(p.numel() for p in trainable_params):,}")
+# ============================================================================
+# OPTIMIZER
+# ============================================================================
+opt = torch.optim.AdamW(trainable_params, lr=LR, betas=(0.9, 0.99), weight_decay=0.01, fused=True)
+total_steps = len(loader) * EPOCHS // GRAD_ACCUM
+warmup = min(1000, total_steps // 10)
+def lr_fn(step):
+    if step < warmup:
+        return step / warmup
+    return 0.5 * (1 + math.cos(math.pi * (step - warmup) / (total_steps - warmup)))
+sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_fn)
+# ============================================================================
+# LOAD CHECKPOINT
+# ============================================================================
+start_step, start_epoch, ema_state = load_checkpoint(model, opt, sched, LOAD_TARGET)
+if RESUME_STEP is not None:
+    start_step = RESUME_STEP
+# ============================================================================
+# COMPILE
+# ============================================================================
+model = torch.compile(model, mode="default")
+# ============================================================================
+# EMA
+# ============================================================================
+print("Initializing EMA...")
+ema = EMA(model, decay=EMA_DECAY)
+if ema_state is not None:
+    ema.load_shadow(ema_state)
+else:
+    print("  Starting fresh EMA from current weights")
+# ============================================================================
+# TENSORBOARD
+# ============================================================================
+run_name = f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+writer = SummaryWriter(os.path.join(LOG_DIR, run_name))
+SAMPLE_PROMPTS = [
+    "a photo of a cat sitting on a windowsill",
+    "a portrait of a woman with red hair",
+    "a black backpack on white background",
+    "a person standing in a t-pose",
+]
+# ============================================================================
+# DISTILLATION WEIGHT SCHEDULE
+# ============================================================================
+def get_distill_weight(step):
+    if step < DISTILL_WARMUP_STEPS:
+        return DISTILL_LOSS_WEIGHT * (step / DISTILL_WARMUP_STEPS)
+    return DISTILL_LOSS_WEIGHT
+# ============================================================================
+# TRAINING LOOP
+# ============================================================================
+print(f"\n{'='*60}")
+print(f"Training TinyFlux-Deep with Expert Distillation (Precached)")
+print(f"{'='*60}")
+print(f"Total: {len(combined_ds):,} samples")
+print(f"Epochs: {EPOCHS}, Steps/epoch: {len(loader)}, Total: {total_steps}")
+print(f"Batch: {BATCH_SIZE} x {GRAD_ACCUM} = {BATCH_SIZE * GRAD_ACCUM}")
+print(f"Expert distillation: {ENABLE_EXPERT_DISTILLATION} (PRECACHED)")
+if ENABLE_EXPERT_DISTILLATION:
+    print(f"  - Expert: {EXPERT_CHECKPOINT}")
+    print(f"  - Timestep buckets: {len(EXPERT_T_BUCKETS)}")
+    print(f"  - Distill weight: {DISTILL_LOSS_WEIGHT} (warmup: {DISTILL_WARMUP_STEPS} steps)")
+    print(f"  - Expert dropout: {EXPERT_DROPOUT}")
+print(f"Masked loss: {USE_MASKED_LOSS}")
+print(f"Min-SNR gamma: {MIN_SNR_GAMMA}")
+print(f"Resume: step {start_step}, epoch {start_epoch}")
+model.train()
+step = start_step
+best = float("inf")
+for ep in range(start_epoch, EPOCHS):
+    ep_loss = 0
+    ep_main_loss = 0
+    ep_distill_loss = 0
+    ep_batches = 0
+    pbar = tqdm(loader, desc=f"E{ep + 1}")
+    for i, batch in enumerate(pbar):
+        latents = batch["latents"].to(DEVICE, non_blocking=True)
+        t5 = batch["t5_embeds"].to(DEVICE, non_blocking=True)
+        clip = batch["clip_pooled"].to(DEVICE, non_blocking=True)
+        local_indices = batch["local_indices"]
+        dataset_ids = batch["dataset_ids"]
+        masks = batch["masks"]
+        if masks is not None:
+            masks = masks.to(DEVICE, non_blocking=True)
+        B, C, H, W = latents.shape
+        data = latents.permute(0, 2, 3, 1).reshape(B, H * W, C)
+        noise = torch.randn_like(data)
+        if TEXT_DROPOUT > 0:
+            t5, clip, _ = apply_text_dropout(t5, clip, TEXT_DROPOUT)
+        t = torch.sigmoid(torch.randn(B, device=DEVICE))
+        t = flux_shift(t, s=SHIFT).to(DTYPE).clamp(1e-4, 1 - 1e-4)
+        t_expanded = t.view(B, 1, 1)
+        x_t = (1 - t_expanded) * noise + t_expanded * data
+        v_target = data - noise
+        img_ids = TinyFluxDeep.create_img_ids(B, H, W, DEVICE)
+        # Get expert features from CACHE (fast!)
+        expert_features = None
+        if ENABLE_EXPERT_DISTILLATION:
+            expert_features = get_expert_features_for_batch(
+                local_indices, dataset_ids, t,
+                portrait_expert_cache, schnell_expert_cache,
+                sportfashion_expert_cache, synthmocap_expert_cache,
+            )
+            # Apply dropout OUTSIDE model (no graph break)
+            if expert_features is not None and random.random() < EXPERT_DROPOUT:
+                expert_features = None
+        with torch.autocast("cuda", dtype=DTYPE):
+            v_pred, expert_info = model(
+                hidden_states=x_t,
+                encoder_hidden_states=t5,
+                pooled_projections=clip,
+                timestep=t,
+                img_ids=img_ids,
+                expert_features=expert_features,
+                return_expert_pred=True,
+            )
+        # Compute losses
+        snr_weights = min_snr_weight(t)
+        main_loss = masked_mse_loss(
+            v_pred, v_target,
+            mask=masks if USE_MASKED_LOSS else None,
+            fg_weight=FG_LOSS_WEIGHT,
+            bg_weight=BG_LOSS_WEIGHT,
+            snr_weights=snr_weights
+        )
+        # Distillation loss
+        distill_loss = torch.tensor(0.0, device=DEVICE)
+        if expert_features is not None and expert_info is not None and 'expert_pred' in expert_info:
+            distill_weight = get_distill_weight(step)
+            distill_loss = F.mse_loss(expert_info['expert_pred'], expert_features)
+            total_loss = main_loss + distill_weight * distill_loss
+        else:
+            total_loss = main_loss
+        loss = total_loss / GRAD_ACCUM
+        loss.backward()
+        if (i + 1) % GRAD_ACCUM == 0:
+            grad_norm = torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
+            opt.step()
+            sched.step()
+            opt.zero_grad(set_to_none=True)
+            ema.update(model)
+            step += 1
+            if step % LOG_EVERY == 0:
+                writer.add_scalar("train/loss", total_loss.item(), step)
+                writer.add_scalar("train/main_loss", main_loss.item(), step)
+                if ENABLE_EXPERT_DISTILLATION:
+                    writer.add_scalar("train/distill_loss", distill_loss.item(), step)
+                    writer.add_scalar("train/distill_weight", get_distill_weight(step), step)
+                writer.add_scalar("train/lr", sched.get_last_lr()[0], step)
+                writer.add_scalar("train/grad_norm", grad_norm.item(), step)
+            if step % SAMPLE_EVERY == 0:
+                print(f"\n  Generating samples at step {step}...")
+                images = generate_samples(model, SAMPLE_PROMPTS, num_steps=20, use_ema=True)
+                save_samples(images, SAMPLE_PROMPTS, step, SAMPLE_DIR)
+            if step % SAVE_EVERY == 0:
+                ckpt_path = os.path.join(CHECKPOINT_DIR, f"step_{step}.pt")
+                weights_path = save_checkpoint(model, opt, sched, step, ep, total_loss.item(), ckpt_path, ema=ema)
+                if step % UPLOAD_EVERY == 0:
+                    upload_checkpoint(weights_path, step)
+        ep_loss += total_loss.item()
+        ep_main_loss += main_loss.item()
+        ep_distill_loss += distill_loss.item()
+        ep_batches += 1
+        pbar.set_postfix(
+            loss=f"{total_loss.item():.4f}",
+            main=f"{main_loss.item():.4f}",
+            dist=f"{distill_loss.item():.4f}" if ENABLE_EXPERT_DISTILLATION else "off",
+            step=step
+        )
+    avg = ep_loss / max(ep_batches, 1)
+    avg_main = ep_main_loss / max(ep_batches, 1)
+    avg_distill = ep_distill_loss / max(ep_batches, 1)
+    print(f"Epoch {ep + 1} - total: {avg:.4f}, main: {avg_main:.4f}, distill: {avg_distill:.4f}")
+    if avg < best:
+        best = avg
+        weights_path = save_checkpoint(model, opt, sched, step, ep, avg, os.path.join(CHECKPOINT_DIR, "best.pt"), ema=ema)
+        try:
+            api.upload_file(path_or_fileobj=weights_path, path_in_repo="model.safetensors", repo_id=HF_REPO)
+        except:
+            pass
+print(f"\n✓ Training complete! Best loss: {best:.4f}")
+writer.close()