AbstractPhil
/

tiny-flux

@@ -36,9 +36,9 @@ DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 # HuggingFace Hub
 HF_REPO = "AbstractPhil/tiny-flux"
-SAVE_EVERY = 500      # steps - local save
-UPLOAD_EVERY = 500    # steps - hub upload
-SAMPLE_EVERY = 250    # steps - generate samples
 LOG_EVERY = 10        # steps - tensorboard
 # Checkpoint loading target
@@ -47,10 +47,14 @@ LOG_EVERY = 10        # steps - tensorboard
 #   "best" - load best model
 #   int (e.g. 1500) - load specific step
 #   "hub:step_1000" - load specific checkpoint from hub
-#   "local:path/to/checkpoint.safetensors" - load specific local file
 #   "none" - start fresh, ignore existing checkpoints
 LOAD_TARGET = "latest"
 # Local paths
 CHECKPOINT_DIR = "./tiny_flux_checkpoints"
 LOG_DIR = "./tiny_flux_logs"
@@ -127,14 +131,37 @@ def encode_prompt(prompt):
 # ============================================================================
 # FLOW MATCHING HELPERS
 # ============================================================================
 def flux_shift(t, s=SHIFT):
     return s * t / (1 + (s - 1) * t)
 def flux_shift_inverse(t_shifted, s=SHIFT):
-    """Inverse of flux_shift for sampling."""
     return t_shifted / (s - (s - 1) * t_shifted)
 def min_snr_weight(t, gamma=MIN_SNR):
     snr = (t / (1 - t).clamp(min=1e-5)).pow(2)
     return torch.clamp(snr, max=gamma) / snr.clamp(min=1e-5)
@@ -143,7 +170,12 @@ def min_snr_weight(t, gamma=MIN_SNR):
 # ============================================================================
 @torch.no_grad()
 def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=64):
-    """Generate sample images using Euler sampling."""
     model.eval()
     B = len(prompts)
     C = 16  # VAE channels
@@ -157,18 +189,21 @@ def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=6
     t5_embeds = torch.stack(t5_embeds)
     clip_pooleds = torch.stack(clip_pooleds)
-    # Start from noise
     x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)
     # Create image IDs
     img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
-    # Euler sampling with uniform timesteps
-    timesteps = torch.linspace(1, 0, num_steps + 1, device=DEVICE)[:-1]
-    for i, t in enumerate(timesteps):
-        t_batch = t.expand(B)
-        dt = 1.0 / num_steps
         # Conditional prediction
         guidance = torch.full((B,), guidance_scale, device=DEVICE, dtype=DTYPE)
@@ -181,15 +216,15 @@ def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=6
             guidance=guidance,
         )
-        # Euler step: x = x + v * dt (going from noise to data)
         x = x + v_cond * dt
     # Reshape to image format: (B, H*W, C) -> (B, C, H, W)
     latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
-    # Decode with VAE
     latents = latents / vae.config.scaling_factor
-    images = vae.decode(latents.float()).sample
     images = (images / 2 + 0.5).clamp(0, 1)
     model.train()
@@ -235,6 +270,32 @@ def collate(batch):
 # ============================================================================
 # CHECKPOINT FUNCTIONS
 # ============================================================================
 def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path):
     """Save checkpoint locally."""
     os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
@@ -306,7 +367,7 @@ def load_checkpoint(model, optimizer, scheduler, target):
             "best" - best model
             int (1500) - specific step
             "hub:step_1000" - specific hub checkpoint
-            "local:/path/to/file.safetensors" - specific local file
             "none" - skip loading, start fresh
     """
     if target == "none":
@@ -342,102 +403,163 @@ def load_checkpoint(model, optimizer, scheduler, target):
     # Load based on mode
     if load_mode == "local":
-        # Direct local file
         if os.path.exists(load_path):
-            weights = load_file(load_path)
             model.load_state_dict(weights)
-            # Try to find associated state file
-            state_path = load_path.replace(".safetensors", ".pt")
-            if os.path.exists(state_path):
-                state = torch.load(state_path, weights_only=False)
-                optimizer.load_state_dict(state["optimizer"])
-                scheduler.load_state_dict(state["scheduler"])
-                start_step = state.get("step", 0)
-                start_epoch = state.get("epoch", 0)
             print(f"✓ Loaded local: {load_path} (step {start_step})")
             return start_step, start_epoch
         else:
             print(f"⚠ Local file not found: {load_path}")
     elif load_mode == "hub":
-        # Specific hub checkpoint
-        try:
-            filename = f"checkpoints/{load_path}.safetensors" if not load_path.endswith(".safetensors") else load_path
-            local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
-            weights = load_file(local_path)
-            model.load_state_dict(weights)
-            # Extract step from filename
-            if "step_" in load_path:
-                start_step = int(load_path.split("step_")[-1].replace(".safetensors", ""))
-            print(f"✓ Loaded from Hub: {filename} (step {start_step})")
-            return start_step, start_epoch
-        except Exception as e:
-            print(f"⚠ Hub load failed: {e}")
     elif load_mode == "best":
-        # Try hub best first
-        try:
-            local_path = hf_hub_download(repo_id=HF_REPO, filename="model.safetensors")
-            weights = load_file(local_path)
-            model.load_state_dict(weights)
-            print(f"✓ Loaded best model from Hub")
-            return start_step, start_epoch
-        except:
-            pass
-        # Try local best
-        best_path = os.path.join(CHECKPOINT_DIR, "best.safetensors")
-        if os.path.exists(best_path):
-            weights = load_file(best_path)
-            model.load_state_dict(weights)
-            state_path = best_path.replace(".safetensors", ".pt")
-            if os.path.exists(state_path):
-                state = torch.load(state_path, weights_only=False)
-                start_step = state.get("step", 0)
-                start_epoch = state.get("epoch", 0)
-            print(f"✓ Loaded local best (step {start_step})")
-            return start_step, start_epoch
     elif load_mode == "step":
         # Specific step number
         step_num = load_path
-        # Try hub
-        try:
-            filename = f"checkpoints/step_{step_num}.safetensors"
-            local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
-            weights = load_file(local_path)
-            model.load_state_dict(weights)
-            start_step = step_num
-            print(f"✓ Loaded step {step_num} from Hub")
-            return start_step, start_epoch
-        except:
-            pass
-        # Try local
-        local_path = os.path.join(CHECKPOINT_DIR, f"step_{step_num}.safetensors")
-        if os.path.exists(local_path):
-            weights = load_file(local_path)
-            model.load_state_dict(weights)
-            state_path = local_path.replace(".safetensors", ".pt")
-            if os.path.exists(state_path):
-                state = torch.load(state_path, weights_only=False)
-                optimizer.load_state_dict(state["optimizer"])
-                scheduler.load_state_dict(state["scheduler"])
-                start_epoch = state.get("epoch", 0)
-            start_step = step_num
-            print(f"✓ Loaded local step {step_num}")
-            return start_step, start_epoch
         print(f"⚠ Step {step_num} not found")
     # Default: latest
-    # Try Hub first
     try:
         files = api.list_repo_files(repo_id=HF_REPO)
-        checkpoints = [f for f in files if f.startswith("checkpoints/step_") and f.endswith(".safetensors")]
         if checkpoints:
-            checkpoints.sort(key=lambda x: int(x.split("step_")[-1].replace(".safetensors", "")))
             latest = checkpoints[-1]
-            step = int(latest.split("step_")[-1].replace(".safetensors", ""))
             local_path = hf_hub_download(repo_id=HF_REPO, filename=latest)
-            weights = load_file(local_path)
             model.load_state_dict(weights)
             start_step = step
             print(f"✓ Loaded latest from Hub: step {step}")
@@ -445,22 +567,33 @@ def load_checkpoint(model, optimizer, scheduler, target):
     except Exception as e:
         print(f"Hub check: {e}")
-    # Try local
     if os.path.exists(CHECKPOINT_DIR):
-        local_ckpts = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith("step_") and f.endswith(".safetensors")]
         if local_ckpts:
-            local_ckpts.sort(key=lambda x: int(x.split("step_")[-1].replace(".safetensors", "")))
             latest = local_ckpts[-1]
-            step = int(latest.split("step_")[-1].replace(".safetensors", ""))
             weights_path = os.path.join(CHECKPOINT_DIR, latest)
-            weights = load_file(weights_path)
             model.load_state_dict(weights)
-            state_path = weights_path.replace(".safetensors", ".pt")
             if os.path.exists(state_path):
-                state = torch.load(state_path, weights_only=False)
-                optimizer.load_state_dict(state["optimizer"])
-                scheduler.load_state_dict(state["scheduler"])
-                start_epoch = state.get("epoch", 0)
             start_step = step
             print(f"✓ Loaded latest local: step {step}")
             return start_step, start_epoch
@@ -479,6 +612,7 @@ loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate,
 config = TinyFluxConfig()
 model = TinyFlux(config).to(DEVICE).to(DTYPE)
 print(f"\nParams: {sum(p.numel() for p in model.parameters()):,}")
 # ============================================================================
 # OPTIMIZER & SCHEDULER
@@ -499,6 +633,11 @@ sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_fn)
 print(f"\nLoad target: {LOAD_TARGET}")
 start_step, start_epoch = load_checkpoint(model, opt, sched, LOAD_TARGET)
 # Log config to tensorboard
 writer.add_text("config", json.dumps(config.__dict__, indent=2), 0)
 writer.add_text("training_config", json.dumps({
@@ -537,26 +676,53 @@ for ep in range(start_epoch, EPOCHS):
     pbar = tqdm(loader, desc=f"E{ep+1}")
     for i, batch in enumerate(pbar):
-        lat = batch["latents"]
         t5 = batch["t5_embeds"]
         clip = batch["clip_pooled"]
-        B, C, H, W = lat.shape
-        x1 = lat.permute(0, 2, 3, 1).reshape(B, H*W, C)
-        x0 = torch.randn_like(x1)
         t = torch.sigmoid(torch.randn(B, device=DEVICE))
-        t = flux_shift(t).to(DTYPE).clamp(1e-4, 1-1e-4)
-        t_exp = t.view(B, 1, 1)
-        x_t = (1 - t_exp) * x0 + t_exp * x1
-        v_target = x1 - x0
         img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
-        guidance = torch.rand(B, device=DEVICE, dtype=DTYPE) * 4 + 1
         with torch.autocast("cuda", dtype=DTYPE):
-            pred = model(
                 hidden_states=x_t,
                 encoder_hidden_states=t5,
                 pooled_projections=clip,
@@ -565,7 +731,10 @@ for ep in range(start_epoch, EPOCHS):
                 guidance=guidance,
             )
-        loss_raw = F.mse_loss(pred, v_target, reduction="none").mean(dim=[1,2])
         snr_weights = min_snr_weight(t)
         loss = (loss_raw * snr_weights).mean() / GRAD_ACCUM
         loss.backward()

 # HuggingFace Hub
 HF_REPO = "AbstractPhil/tiny-flux"
+SAVE_EVERY = 1000      # steps - local save
+UPLOAD_EVERY = 1000    # steps - hub upload
+SAMPLE_EVERY = 500    # steps - generate samples
 LOG_EVERY = 10        # steps - tensorboard
 # Checkpoint loading target
 #   "best" - load best model
 #   int (e.g. 1500) - load specific step
 #   "hub:step_1000" - load specific checkpoint from hub
+#   "local:path/to/checkpoint.safetensors" or "local:path/to/checkpoint.pt"
 #   "none" - start fresh, ignore existing checkpoints
 LOAD_TARGET = "latest"
+# Manual resume step (set to override step from checkpoint, or None to use checkpoint's step)
+# Useful when checkpoint doesn't contain step info
+RESUME_STEP = None  # e.g., 5000 to resume from step 5000
 # Local paths
 CHECKPOINT_DIR = "./tiny_flux_checkpoints"
 LOG_DIR = "./tiny_flux_logs"
 # ============================================================================
 # FLOW MATCHING HELPERS
 # ============================================================================
+# Rectified Flow / Flow Matching formulation:
+#   x_t = (1-t) * x_0 + t * x_1
+#   where x_0 = noise, x_1 = data
+#   t=0: pure noise, t=1: pure data
+#   velocity v = x_1 - x_0 = data - noise
+#
+# Training: model learns to predict v given (x_t, t)
+# Inference: start from noise (t=0), integrate to data (t=1)
+#   x_{t+dt} = x_t + v_pred * dt
+# ============================================================================
 def flux_shift(t, s=SHIFT):
     """Apply the Flux timestep shift used for the training distribution.
     Computes ``s * t / (1 + (s - 1) * t)``. The endpoints t=0 and t=1 are
     fixed points; for s > 1 every value in between moves towards 1 (the
     data end in this file's convention), so training sees more timesteps
     close to data. Example: with s=3.0, flux_shift(0.5) = 0.75.
     Args:
         t: timestep(s); the training loop passes a tensor of values in (0, 1).
         s: shift strength, defaulting to the module-level SHIFT.
     Returns:
         The shifted timestep(s), computed elementwise from ``t``.
     """
     scaled = s * t
     warp = 1 + (s - 1) * t
     return scaled / warp
 def flux_shift_inverse(t_shifted, s=SHIFT):
     """Undo flux_shift: map a shifted timestep back to the unshifted one.
     Solves ``t_shifted = s * t / (1 + (s - 1) * t)`` for ``t``, giving
     ``t = t_shifted / (s - (s - 1) * t_shifted)``.
     Args:
         t_shifted: timestep(s) previously produced by flux_shift.
         s: shift strength used for the forward shift (module-level SHIFT
            by default).
     Returns:
         The unshifted timestep(s), computed elementwise.
     """
     denominator = s - (s - 1) * t_shifted
     return t_shifted / denominator
 def min_snr_weight(t, gamma=MIN_SNR):
     """Min-SNR loss weight ``min(snr, gamma) / snr`` for timestep(s) ``t``.
     With x_t = (1-t)*noise + t*data the signal-to-noise ratio is
     ``(t / (1 - t))**2``. Timesteps whose SNR is at most ``gamma`` keep
     weight 1; higher-SNR timesteps (t approaching 1, i.e. nearly clean
     data) are scaled down to ``gamma / snr``.
     Args:
         t: tensor of timesteps in [0, 1].
         gamma: SNR cap, defaulting to the module-level MIN_SNR
                (expected to be positive).
     Returns:
         Tensor of per-timestep weights, same shape as ``t``; values lie
         in (0, 1] for positive ``gamma``.
     """
     # Floor both the (1 - t) divisor and the SNR itself so that neither
     # t=1 nor t=0 can cause a division by zero.
     snr = (t / (1 - t).clamp(min=1e-5)).pow(2).clamp(min=1e-5)
     # Use the same floored SNR in numerator and denominator. Flooring only
     # the denominator made the weight fall towards 0 (instead of staying
     # at 1) once the raw SNR dropped below the floor, i.e. for t near 0.
     return snr.clamp(max=gamma) / snr
 # ============================================================================
 @torch.no_grad()
 def generate_samples(model, prompts, num_steps=20, guidance_scale=3.5, H=64, W=64):
+    """Generate sample images using Euler sampling.
+    Flow matching: x_t = (1-t)*noise + t*data, v = data - noise
+    At t=0: pure noise. At t=1: pure data.
+    We integrate from t=0 to t=1.
+    """
     model.eval()
     B = len(prompts)
     C = 16  # VAE channels
     t5_embeds = torch.stack(t5_embeds)
     clip_pooleds = torch.stack(clip_pooleds)
+    # Start from pure noise (t=0)
     x = torch.randn(B, H * W, C, device=DEVICE, dtype=DTYPE)
     # Create image IDs
     img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
+    # Euler sampling: t goes from 0 (noise) to 1 (data)
+    timesteps = torch.linspace(0, 1, num_steps + 1, device=DEVICE, dtype=DTYPE)
+    for i in range(num_steps):
+        t_curr = timesteps[i]
+        t_next = timesteps[i + 1]
+        dt = t_next - t_curr  # positive
+        t_batch = t_curr.expand(B)
         # Conditional prediction
         guidance = torch.full((B,), guidance_scale, device=DEVICE, dtype=DTYPE)
             guidance=guidance,
         )
+        # Euler step: x_{t+dt} = x_t + v * dt
         x = x + v_cond * dt
     # Reshape to image format: (B, H*W, C) -> (B, C, H, W)
     latents = x.reshape(B, H, W, C).permute(0, 3, 1, 2)
+    # Decode with VAE (match VAE dtype)
     latents = latents / vae.config.scaling_factor
+    images = vae.decode(latents.to(vae.dtype)).sample
     images = (images / 2 + 0.5).clamp(0, 1)
     model.train()
 # ============================================================================
 # CHECKPOINT FUNCTIONS
 # ============================================================================
+def load_weights(path):
+    """Load weights from .safetensors or .pt file."""
+    if path.endswith(".safetensors"):
+        return load_file(path)
+    elif path.endswith(".pt"):
+        ckpt = torch.load(path, map_location=DEVICE, weights_only=False)
+        if isinstance(ckpt, dict):
+            if "model" in ckpt:
+                return ckpt["model"]
+            elif "state_dict" in ckpt:
+                return ckpt["state_dict"]
+            else:
+                # Check if it looks like a state dict (has tensor values)
+                first_val = next(iter(ckpt.values()), None)
+                if isinstance(first_val, torch.Tensor):
+                    return ckpt
+                # Otherwise might have optimizer etc, look for model keys
+                return ckpt
+        return ckpt
+    else:
+        # Try safetensors first, then pt
+        try:
+            return load_file(path)
+        except:
+            return torch.load(path, map_location=DEVICE, weights_only=False)
 def save_checkpoint(model, optimizer, scheduler, step, epoch, loss, path):
     """Save checkpoint locally."""
     os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
             "best" - best model
             int (1500) - specific step
             "hub:step_1000" - specific hub checkpoint
+            "local:/path/to/file.safetensors" or "local:/path/to/file.pt" - specific local file
             "none" - skip loading, start fresh
     """
     if target == "none":
     # Load based on mode
     if load_mode == "local":
+        # Direct local file (.pt or .safetensors)
         if os.path.exists(load_path):
+            weights = load_weights(load_path)
             model.load_state_dict(weights)
+            # Try to find associated state file for optimizer/scheduler
+            if load_path.endswith(".safetensors"):
+                state_path = load_path.replace(".safetensors", ".pt")
+            elif load_path.endswith(".pt"):
+                # The .pt file might contain everything
+                ckpt = torch.load(load_path, map_location=DEVICE, weights_only=False)
+                if isinstance(ckpt, dict):
+                    # Debug: show what keys are in the checkpoint
+                    non_tensor_keys = [k for k in ckpt.keys() if not isinstance(ckpt.get(k), torch.Tensor)]
+                    if non_tensor_keys:
+                        print(f"  Checkpoint keys: {non_tensor_keys}")
+                    # Extract step/epoch - try multiple common key names
+                    start_step = ckpt.get("step", ckpt.get("global_step", ckpt.get("iteration", 0)))
+                    start_epoch = ckpt.get("epoch", 0)
+                    # Also check for nested state dict
+                    if "state" in ckpt and isinstance(ckpt["state"], dict):
+                        start_step = ckpt["state"].get("step", start_step)
+                        start_epoch = ckpt["state"].get("epoch", start_epoch)
+                    # Try to load optimizer/scheduler if present
+                    if "optimizer" in ckpt:
+                        try:
+                            optimizer.load_state_dict(ckpt["optimizer"])
+                            if "scheduler" in ckpt:
+                                scheduler.load_state_dict(ckpt["scheduler"])
+                        except Exception as e:
+                            print(f"  Note: Could not load optimizer state: {e}")
+                state_path = None
+            else:
+                state_path = load_path + ".pt"
+            if state_path and os.path.exists(state_path):
+                state = torch.load(state_path, map_location=DEVICE, weights_only=False)
+                try:
+                    start_step = state.get("step", start_step)
+                    start_epoch = state.get("epoch", start_epoch)
+                    if "optimizer" in state:
+                        optimizer.load_state_dict(state["optimizer"])
+                    if "scheduler" in state:
+                        scheduler.load_state_dict(state["scheduler"])
+                except Exception as e:
+                    print(f"  Note: Could not load optimizer state: {e}")
             print(f"✓ Loaded local: {load_path} (step {start_step})")
             return start_step, start_epoch
         else:
             print(f"⚠ Local file not found: {load_path}")
     elif load_mode == "hub":
+        # Specific hub checkpoint - try both extensions
+        for ext in [".safetensors", ".pt", ""]:
+            try:
+                if load_path.endswith((".safetensors", ".pt")):
+                    filename = load_path if "/" in load_path else f"checkpoints/{load_path}"
+                else:
+                    filename = f"checkpoints/{load_path}{ext}"
+                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
+                weights = load_weights(local_path)
+                model.load_state_dict(weights)
+                # Extract step from filename
+                if "step_" in load_path:
+                    start_step = int(load_path.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
+                print(f"✓ Loaded from Hub: {filename} (step {start_step})")
+                return start_step, start_epoch
+            except Exception as e:
+                continue
+        print(f"⚠ Could not load from hub: {load_path}")
     elif load_mode == "best":
+        # Try hub best first (try both extensions)
+        for ext in [".safetensors", ".pt"]:
+            try:
+                filename = f"model{ext}" if ext else "model.safetensors"
+                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
+                weights = load_weights(local_path)
+                model.load_state_dict(weights)
+                print(f"✓ Loaded best model from Hub")
+                return start_step, start_epoch
+            except:
+                continue
+        # Try local best (both extensions)
+        for ext in [".safetensors", ".pt"]:
+            best_path = os.path.join(CHECKPOINT_DIR, f"best{ext}")
+            if os.path.exists(best_path):
+                weights = load_weights(best_path)
+                model.load_state_dict(weights)
+                # Try to load optimizer state
+                state_path = best_path.replace(ext, ".pt") if ext == ".safetensors" else best_path
+                if os.path.exists(state_path):
+                    state = torch.load(state_path, map_location=DEVICE, weights_only=False)
+                    if isinstance(state, dict) and "step" in state:
+                        start_step = state.get("step", 0)
+                        start_epoch = state.get("epoch", 0)
+                print(f"✓ Loaded local best (step {start_step})")
+                return start_step, start_epoch
     elif load_mode == "step":
         # Specific step number
         step_num = load_path
+        # Try hub (both extensions)
+        for ext in [".safetensors", ".pt"]:
+            try:
+                filename = f"checkpoints/step_{step_num}{ext}"
+                local_path = hf_hub_download(repo_id=HF_REPO, filename=filename)
+                weights = load_weights(local_path)
+                model.load_state_dict(weights)
+                start_step = step_num
+                print(f"✓ Loaded step {step_num} from Hub")
+                return start_step, start_epoch
+            except:
+                continue
+        # Try local (both extensions)
+        for ext in [".safetensors", ".pt"]:
+            local_path = os.path.join(CHECKPOINT_DIR, f"step_{step_num}{ext}")
+            if os.path.exists(local_path):
+                weights = load_weights(local_path)
+                model.load_state_dict(weights)
+                state_path = local_path.replace(".safetensors", ".pt") if ext == ".safetensors" else local_path
+                if os.path.exists(state_path):
+                    state = torch.load(state_path, map_location=DEVICE, weights_only=False)
+                    if isinstance(state, dict):
+                        try:
+                            if "optimizer" in state:
+                                optimizer.load_state_dict(state["optimizer"])
+                            if "scheduler" in state:
+                                scheduler.load_state_dict(state["scheduler"])
+                            start_epoch = state.get("epoch", 0)
+                        except:
+                            pass
+                start_step = step_num
+                print(f"✓ Loaded local step {step_num}")
+                return start_step, start_epoch
         print(f"⚠ Step {step_num} not found")
     # Default: latest
+    # Try Hub first (both extensions)
     try:
         files = api.list_repo_files(repo_id=HF_REPO)
+        checkpoints = [f for f in files if f.startswith("checkpoints/step_") and (f.endswith(".safetensors") or f.endswith(".pt"))]
         if checkpoints:
+            # Sort by step number
+            def get_step(f):
+                return int(f.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
+            checkpoints.sort(key=get_step)
             latest = checkpoints[-1]
+            step = get_step(latest)
             local_path = hf_hub_download(repo_id=HF_REPO, filename=latest)
+            weights = load_weights(local_path)
             model.load_state_dict(weights)
             start_step = step
             print(f"✓ Loaded latest from Hub: step {step}")
     except Exception as e:
         print(f"Hub check: {e}")
+    # Try local (both extensions)
     if os.path.exists(CHECKPOINT_DIR):
+        local_ckpts = [f for f in os.listdir(CHECKPOINT_DIR) if f.startswith("step_") and (f.endswith(".safetensors") or f.endswith(".pt"))]
+        # Filter to just weights files (not state .pt files that pair with .safetensors)
+        local_ckpts = [f for f in local_ckpts if not (f.endswith(".pt") and f.replace(".pt", ".safetensors") in local_ckpts)]
         if local_ckpts:
+            def get_step(f):
+                return int(f.split("step_")[-1].replace(".safetensors", "").replace(".pt", ""))
+            local_ckpts.sort(key=get_step)
             latest = local_ckpts[-1]
+            step = get_step(latest)
             weights_path = os.path.join(CHECKPOINT_DIR, latest)
+            weights = load_weights(weights_path)
             model.load_state_dict(weights)
+            # Try to load optimizer state
+            state_path = weights_path.replace(".safetensors", ".pt") if weights_path.endswith(".safetensors") else weights_path
             if os.path.exists(state_path):
+                state = torch.load(state_path, map_location=DEVICE, weights_only=False)
+                if isinstance(state, dict):
+                    try:
+                        if "optimizer" in state:
+                            optimizer.load_state_dict(state["optimizer"])
+                        if "scheduler" in state:
+                            scheduler.load_state_dict(state["scheduler"])
+                        start_epoch = state.get("epoch", 0)
+                    except:
+                        pass
             start_step = step
             print(f"✓ Loaded latest local: step {step}")
             return start_step, start_epoch
 config = TinyFluxConfig()
 model = TinyFlux(config).to(DEVICE).to(DTYPE)
 print(f"\nParams: {sum(p.numel() for p in model.parameters()):,}")
+model = torch.compile(model, mode="default")
 # ============================================================================
 # OPTIMIZER & SCHEDULER
 print(f"\nLoad target: {LOAD_TARGET}")
 start_step, start_epoch = load_checkpoint(model, opt, sched, LOAD_TARGET)
+# Override start_step if RESUME_STEP is set
+if RESUME_STEP is not None:
+    print(f"Overriding start_step: {start_step} -> {RESUME_STEP}")
+    start_step = RESUME_STEP
 # Log config to tensorboard
 writer.add_text("config", json.dumps(config.__dict__, indent=2), 0)
 writer.add_text("training_config", json.dumps({
     pbar = tqdm(loader, desc=f"E{ep+1}")
     for i, batch in enumerate(pbar):
+        latents = batch["latents"]      # Ground truth data (VAE encoded images)
         t5 = batch["t5_embeds"]
         clip = batch["clip_pooled"]
+        B, C, H, W = latents.shape
+        # ================================================================
+        # FLOW MATCHING FORMULATION
+        # ================================================================
+        # x_1 = data (what we want to generate)
+        # x_0 = noise (where we start at inference)
+        # x_t = (1-t)*x_0 + t*x_1  (linear interpolation)
+        #
+        # At t=0: x_t = x_0 (pure noise)
+        # At t=1: x_t = x_1 (pure data)
+        #
+        # Velocity field: v = dx/dt = x_1 - x_0
+        # Model learns to predict v given (x_t, t)
+        #
+        # At inference: start from noise, integrate v from t=0 to t=1
+        # ================================================================
+        # Reshape data to sequence format: (B, C, H, W) -> (B, H*W, C)
+        data = latents.permute(0, 2, 3, 1).reshape(B, H*W, C)  # x_1
+        noise = torch.randn_like(data)                         # x_0
+        # Sample timesteps with logit-normal distribution + Flux shift
+        # This biases training towards higher t (closer to data)
         t = torch.sigmoid(torch.randn(B, device=DEVICE))
+        t = flux_shift(t, s=SHIFT).to(DTYPE).clamp(1e-4, 1-1e-4)
+        # Create noisy samples via linear interpolation
+        t_expanded = t.view(B, 1, 1)
+        x_t = (1 - t_expanded) * noise + t_expanded * data  # Noisy sample at time t
+        # Target velocity: direction from noise to data
+        v_target = data - noise
+        # Create position IDs for RoPE
         img_ids = TinyFlux.create_img_ids(B, H, W, DEVICE)
+        # Random guidance scale (for CFG training)
+        guidance = torch.rand(B, device=DEVICE, dtype=DTYPE) * 4 + 1  # [1, 5]
+        # Forward pass: predict velocity
         with torch.autocast("cuda", dtype=DTYPE):
+            v_pred = model(
                 hidden_states=x_t,
                 encoder_hidden_states=t5,
                 pooled_projections=clip,
                 guidance=guidance,
             )
+        # Loss: MSE between predicted and target velocity
+        loss_raw = F.mse_loss(v_pred, v_target, reduction="none").mean(dim=[1, 2])
+        # Min-SNR weighting: downweight high-SNR timesteps (t near 1, i.e. almost clean data)
         snr_weights = min_snr_weight(t)
         loss = (loss_raw * snr_weights).mean() / GRAD_ACCUM
         loss.backward()