AbstractPhil
/

sd15-flow-lune

+"""
+    SD15 Flow-Matching trainer
+    Author: AbstractPhil
+    Loads the current-format .pt checkpoint and runs several validation checks to confirm the weights were loaded correctly before training.
+    Trains flow matching for SD1.5.
+    License: MIT
+    If you use my work, a citation wouldn't hurt.
+"""
+import os
+import json
+import datetime
+from dataclasses import dataclass, asdict
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+import torch
+import torch.nn.functional as F
+from torch.utils.tensorboard import SummaryWriter
+from torch.utils.data import DataLoader
+import datasets
+from diffusers import UNet2DConditionModel
+from huggingface_hub import HfApi, create_repo, hf_hub_download
@dataclass
class TrainConfig:
    """Settings for a single SD1.5 flow-matching training run.

    Every field has a default, so ``TrainConfig()`` is a complete, usable
    configuration. The whole object is serialised with ``asdict`` into
    ``config.json`` and into each ``.pt`` checkpoint.
    """

    # Parent directory for run outputs.
    output_dir: str = "./outputs"

    # Hub repository and file name of the checkpoint training starts from.
    model_repo: str = "AbstractPhil/sd15-flow-matching-try2"
    checkpoint_filename: str = "sd15_flowmatch_david_weighted_2_e34.pt"

    # Hub dataset that is streamed during training.
    dataset_name: str = "AbstractPhil/sd15-latent-distillation-500k"

    # Hub repository that receives checkpoints, and the switch that enables uploads.
    hf_repo_id: str = "AbstractPhil/sd15-flow-lune"
    upload_to_hub: bool = True

    # Seed for torch RNGs and for dataset shuffling.
    seed: int = 42

    # Samples per optimisation step.
    batch_size: int = 16

    # Base learning rate; the optimiser uses base_lr * sqrt(batch_size).
    base_lr: float = 2e-6

    # Shift applied to uniformly sampled sigmas when drawing timesteps.
    shift: float = 2.0

    # Probability of zeroing a sample's conditioning (for CFG support).
    dropout: float = 0.1

    # Total number of optimisation steps, and the interval between checkpoints.
    max_train_steps: int = 50_000
    checkpointing_steps: int = 1000

    # Worker processes used by the DataLoader.
    num_workers: int = 0

    # VAE scaling factor; raw latents are multiplied by this value.
    vae_scale: float = 0.18215
def _strip_key_prefix(state_dict, prefix="unet."):
    """Return a copy of ``state_dict`` with ``prefix`` removed from keys that start with it."""
    return {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }


def _state_dict_diff_stats(reference, candidate, keys, tol=1e-6, warn_on_shape_mismatch=False):
    """Measure how far ``candidate`` tensors are from ``reference`` tensors.

    Only ``keys`` are compared, and candidate tensors are cast to fp32 first.
    A tensor counts as "different" when its largest absolute element-wise
    difference exceeds ``tol``. Tensors whose shapes disagree are skipped
    (optionally with a printed warning).

    Returns:
        ``(total, differing, abs_diff_sum, max_diff)`` where ``total`` and
        ``differing`` are element counts, ``abs_diff_sum`` is the summed
        absolute difference over differing tensors, and ``max_diff`` is the
        largest absolute difference seen.
    """
    total = 0
    differing = 0
    abs_diff_sum = 0.0
    max_diff = 0.0
    for key in keys:
        ref = reference[key]
        cand = candidate[key].float()  # Convert to fp32 for comparison
        if ref.shape != cand.shape:
            if warn_on_shape_mismatch:
                print(f"⚠ Shape mismatch for {key}: {ref.shape} vs {cand.shape}")
            continue
        total += ref.numel()
        diff = (ref - cand).abs()
        peak = diff.max()
        if peak > tol:
            differing += ref.numel()
            abs_diff_sum += diff.sum().item()
            max_diff = max(max_diff, peak.item())
    return total, differing, abs_diff_sum, max_diff


def load_student_unet(repo_id: str, filename: str, device="cuda") -> UNet2DConditionModel:
    """Download a student checkpoint and load it into an SD1.5 UNet, with verification.

    The checkpoint is expected to be a ``.pt`` file holding a dict with a
    ``"student"`` state dict (keys optionally prefixed with ``"unet."``) and,
    optionally, a ``"gstep"`` step counter. The function prints a report that
    compares the student weights with the base SD1.5 UNet before and after
    loading. Suspicious results only produce printed warnings; they do not
    raise.

    Args:
        repo_id: Hugging Face model repository containing the checkpoint.
        filename: Name of the checkpoint file inside ``repo_id``.
        device: Device the loaded UNet is moved to before being returned.

    Returns:
        The fp32 UNet with the student weights loaded, on ``device``.

    Raises:
        KeyError: If the checkpoint has no ``"student"`` entry.
    """
    # Download checkpoint from HuggingFace
    print(f"Downloading checkpoint from {repo_id}/{filename}...")
    checkpoint_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="model",
    )
    print(f"✓ Downloaded to: {checkpoint_path}")

    # NOTE(security): torch.load unpickles the file, so only use checkpoints
    # from repositories you trust. Consider weights_only=True if the installed
    # torch version and the checkpoint contents allow it.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # Fail early, before fetching the base UNet, if the checkpoint layout is wrong.
    try:
        student_state_dict = checkpoint["student"]
    except KeyError as err:
        raise KeyError(
            f"Checkpoint {filename!r} has no 'student' entry; "
            f"available keys: {list(checkpoint)}"
        ) from err

    # Initialize UNet with SD1.5 config in fp32
    print("Loading SD1.5 UNet architecture...")
    unet = UNet2DConditionModel.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        subfolder="unet",
        torch_dtype=torch.float32,
    )

    # Snapshot the base weights; load_state_dict overwrites them in place.
    original_state_dict = {k: v.clone() for k, v in unet.state_dict().items()}
    cleaned_student_dict = _strip_key_prefix(student_state_dict, "unet.")

    print(f"\n{'='*70}")
    print("WEIGHT VERIFICATION")
    print(f"{'='*70}")

    # 1. Compare keys
    original_keys = set(original_state_dict.keys())
    student_keys = set(cleaned_student_dict.keys())
    matching_keys = original_keys & student_keys
    print(f"Original UNet keys: {len(original_keys)}")
    print(f"Student checkpoint keys: {len(student_keys)}")
    print(f"Matching keys: {len(matching_keys)}")

    # 2. Compare student weights vs original BEFORE loading
    total_params, different_params, mean_diff_sum, max_diff = _state_dict_diff_stats(
        original_state_dict,
        cleaned_student_dict,
        matching_keys,
        warn_on_shape_mismatch=True,
    )
    pct_different = (different_params / total_params * 100) if total_params > 0 else 0
    avg_diff = mean_diff_sum / different_params if different_params > 0 else 0
    print("\nStudent vs Original (BEFORE loading):")
    print(f"  Total parameters: {total_params:,}")
    print(f"  Parameters different: {different_params:,} ({pct_different:.1f}%)")
    print(f"  Average difference: {avg_diff:.6f}")
    print(f"  Max difference: {max_diff:.6f}")

    # 3. Load weights. strict=False tolerates missing/unexpected keys only;
    # a shape mismatch on a shared key still raises inside load_state_dict.
    load_result = unet.load_state_dict(cleaned_student_dict, strict=False)
    if load_result.missing_keys:
        print(f"\n⚠ Missing keys during load: {len(load_result.missing_keys)}")
        for key in load_result.missing_keys[:3]:
            print(f"  - {key}")
    if load_result.unexpected_keys:
        print(f"⚠ Unexpected keys during load: {len(load_result.unexpected_keys)}")
        for key in load_result.unexpected_keys[:3]:
            print(f"  - {key}")

    # 4. Verify weights actually changed after loading
    total_params_after, changed_params, mean_diff_after, max_diff_after = _state_dict_diff_stats(
        original_state_dict,
        unet.state_dict(),
        matching_keys,
    )
    pct_changed = (changed_params / total_params_after * 100) if total_params_after > 0 else 0
    avg_diff_after = mean_diff_after / changed_params if changed_params > 0 else 0
    print("\nOriginal vs Loaded (AFTER loading):")
    print(f"  Parameters changed: {changed_params:,} ({pct_changed:.1f}%)")
    print(f"  Average difference: {avg_diff_after:.6f}")
    print(f"  Max difference: {max_diff_after:.6f}")
    print(f"\n{'='*70}")

    # Verification checks (heuristic thresholds; warnings only)
    if pct_different < 50:
        print(f"⚠️ WARNING: Student weights only {pct_different:.1f}% different from base!")
        print("   This checkpoint may not be trained.")
    elif pct_changed < 90:
        print(f"⚠️ WARNING: Only {pct_changed:.1f}% of weights changed after loading!")
        print("   The load may have failed.")
    else:
        print("✅ Weights loaded successfully!")
        print(f"   Checkpoint step: {checkpoint.get('gstep', 'unknown')}")
        print(f"   {pct_different:.1f}% of weights differ from base SD1.5")
    print(f"{'='*70}\n")

    return unet.to(device)
def train(config: TrainConfig):
    """Fine-tune the SD1.5 student UNet with a flow-matching (velocity) objective.

    Steps performed:
      1. Create a timestamped run directory under ``config.output_dir`` and a
         TensorBoard writer inside it.
      2. Optionally prepare the Hugging Face repo (uploads are best-effort; if
         repo setup fails, ``config.upload_to_hub`` is set to ``False``).
      3. Stream and shuffle the dataset, scaling latents by ``config.vae_scale``.
      4. Load the student UNet and train it in fp32 with Adam until
         ``config.max_train_steps`` optimisation steps have been taken,
         re-iterating the dataset as many times as needed.
      5. Save (and optionally upload) a checkpoint every
         ``config.checkpointing_steps`` steps and once at the final step.

    Args:
        config: Training configuration. ``upload_to_hub`` may be mutated as
            described above.

    Raises:
        RuntimeError: If a pass over the dataset yields no batches.
    """
    device = "cuda"
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)

    # Setup output directory
    date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    real_output_dir = os.path.join(config.output_dir, date_time)
    os.makedirs(real_output_dir, exist_ok=True)
    t_writer = SummaryWriter(log_dir=real_output_dir, flush_secs=60)

    # Initialize HuggingFace API
    hf_api = None
    if config.upload_to_hub:
        try:
            hf_api = HfApi()
            create_repo(
                repo_id=config.hf_repo_id,
                repo_type="model",
                exist_ok=True,
                private=False,
            )
            print(f"✓ HuggingFace repo ready: {config.hf_repo_id}")
        except Exception as e:
            # Hub access is optional: keep training locally if it is unavailable.
            print(f"⚠ Hub upload disabled: {e}")
            config.upload_to_hub = False

    # Save config locally and to hub
    config_path = os.path.join(real_output_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(asdict(config), f, indent=2)
    if config.upload_to_hub and hf_api is not None:
        # Best-effort, like the checkpoint uploads: a failed upload must not abort training.
        try:
            hf_api.upload_file(
                path_or_fileobj=config_path,
                path_in_repo="config.json",
                repo_id=config.hf_repo_id,
                repo_type="model",
            )
        except Exception as e:
            print(f"⚠ Config upload failed: {e}")

    # Load dataset in streaming mode
    print(f"\nLoading dataset (streaming): {config.dataset_name}")
    # NOTE(review): trust_remote_code=True allows the dataset repo to run its
    # own loading code locally - confirm this is still required.
    train_dataset = datasets.load_dataset(
        config.dataset_name,
        split="train",
        streaming=True,
        trust_remote_code=True,
    )
    train_dataset = train_dataset.shuffle(seed=config.seed, buffer_size=1000)
    print("✓ Dataset loaded in streaming mode")

    def collate_fn(examples):
        """Stack examples into (latents, clip_embeddings, ids, prompts), scaling the latents."""
        # Latents are RAW from VAE - need to scale them
        latents = torch.stack([torch.tensor(ex["latent"]) for ex in examples])
        latents = latents * config.vae_scale
        clip_embeddings = torch.stack([torch.tensor(ex["clip_embedding"]) for ex in examples])
        ids = [ex["id"] for ex in examples]
        prompts = [ex["prompt"] for ex in examples]
        return latents, clip_embeddings, ids, prompts

    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        num_workers=config.num_workers,
    )

    # Verify first batch latent range. The batch comes from a throwaway
    # iterator; the training loop below creates its own.
    print("\nVerifying latent scaling on first batch...")
    first_batch = next(iter(train_dataloader))
    latents_check, _, _, _ = first_batch
    # collate_fn has already applied vae_scale, so this is the scaled range.
    print(f"Scaled latent range (CPU): [{latents_check.min():.3f}, {latents_check.max():.3f}]")
    latents_check = latents_check.to(device)
    print(f"After GPU transfer: [{latents_check.min():.3f}, {latents_check.max():.3f}]")
    print("Expected: ~[-1, 1] for properly scaled latents")
    del latents_check

    # Load pretrained student UNet
    print("\nLoading model from HuggingFace...")
    unet = load_student_unet(config.model_repo, config.checkpoint_filename, device=device)
    unet.requires_grad_(True)
    unet.enable_gradient_checkpointing()
    unet.train()

    optimizer = torch.optim.Adam(
        unet.parameters(),
        # Learning rate is scaled with the square root of the batch size.
        lr=config.base_lr * (config.batch_size ** 0.5),
    )

    global_step = 0
    # NOTE: these per-sample logs grow for the whole run (batch_size entries
    # per step) and are re-plotted / re-serialised in full each time.
    train_logs = {
        "train_step": [],
        "train_loss": [],
        "train_timestep": [],
        "trained_images": [],
    }

    def get_prediction(batch, log_to=None):
        """Run one flow-matching forward pass and return the mean batch loss.

        When ``log_to`` is given, per-sample loss, timestep, id and prompt are
        appended to it, tagged with the current ``global_step``.
        """
        latents, encoder_hidden_states, ids, prompts = batch
        # Everything in fp32
        latents = latents.to(dtype=torch.float32, device=device)
        encoder_hidden_states = encoder_hidden_states.to(dtype=torch.float32, device=device)
        batch_size = latents.shape[0]

        # Apply dropout to conditioning for CFG support
        dropout_mask = torch.rand(batch_size, device=device) < config.dropout
        encoder_hidden_states = encoder_hidden_states.clone()
        encoder_hidden_states[dropout_mask] = 0

        # Sample timesteps with shift
        sigmas = torch.rand(batch_size, device=device)
        sigmas = (config.shift * sigmas) / (1 + (config.shift - 1) * sigmas)
        timesteps = sigmas * 1000
        sigmas = sigmas[:, None, None, None]

        # Flow matching forward process: interpolate between data and noise,
        # and regress the velocity (noise - data).
        noise = torch.randn_like(latents)
        noisy_latents = noise * sigmas + latents * (1 - sigmas)
        target = noise - latents

        # Predict velocity
        pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
        loss = F.mse_loss(pred, target, reduction="none")
        loss = loss.mean(dim=list(range(1, len(loss.shape))))  # one loss value per sample

        if log_to is not None:
            # One device->host transfer per tensor instead of one per sample.
            log_to["train_step"].extend([global_step] * batch_size)
            log_to["train_loss"].extend(loss.detach().tolist())
            log_to["train_timestep"].extend(timesteps.tolist())
            log_to["trained_images"].extend(
                {"step": global_step, "id": sample_id, "prompt": prompt}
                for sample_id, prompt in zip(ids, prompts)
            )
        return loss.mean()

    def plot_logs(log_dict):
        """Draw a loss-vs-timestep scatter (coloured by step) on a new current figure."""
        plt.figure(figsize=(10, 6))
        plt.scatter(
            log_dict["train_timestep"],
            log_dict["train_loss"],
            s=3,
            c=log_dict["train_step"],
            marker=".",
            cmap='cool',
        )
        plt.xlabel("timestep")
        plt.ylabel("loss")
        plt.yscale("log")
        plt.colorbar(label="step")

    def save_checkpoint(step):
        """Write diffusers weights, a resumable .pt file and metadata; upload if enabled."""
        checkpoint_path = os.path.join(real_output_dir, f"checkpoint-{step:08}")
        os.makedirs(checkpoint_path, exist_ok=True)

        # Save UNet weights as diffusers format
        unet.save_pretrained(
            os.path.join(checkpoint_path, "unet"),
            safe_serialization=True,
        )

        # Save complete checkpoint in .pt format
        pt_filename = f"sd15_flow_lune_e{step//1000}_s{step}.pt"
        pt_path = os.path.join(checkpoint_path, pt_filename)
        torch.save({
            "cfg": asdict(config),
            "student": unet.state_dict(),
            "opt": optimizer.state_dict(),
            "gstep": step,
        }, pt_path)

        # Save training metadata
        metadata = {
            "step": step,
            "trained_images": train_logs["trained_images"],
        }
        metadata_path = os.path.join(checkpoint_path, "trained_images.json")
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)
        print(f"✓ Checkpoint saved at step {step}")

        # Upload to HuggingFace Hub (best-effort)
        if config.upload_to_hub and hf_api is not None:
            try:
                hf_api.upload_file(
                    path_or_fileobj=pt_path,
                    path_in_repo=pt_filename,
                    repo_id=config.hf_repo_id,
                    repo_type="model",
                )
                hf_api.upload_folder(
                    folder_path=os.path.join(checkpoint_path, "unet"),
                    path_in_repo=f"checkpoint-{step:08}/unet",
                    repo_id=config.hf_repo_id,
                    repo_type="model",
                )
                hf_api.upload_file(
                    path_or_fileobj=metadata_path,
                    path_in_repo=f"checkpoint-{step:08}/trained_images.json",
                    repo_id=config.hf_repo_id,
                    repo_type="model",
                )
                print(f"✓ Uploaded to hub: {config.hf_repo_id}")
            except Exception as e:
                print(f"⚠ Upload failed: {e}")

    print("\nStarting training...")
    progress_bar = tqdm(range(0, config.max_train_steps))
    epoch = 0
    try:
        # One pass over the stream may contain fewer batches than
        # max_train_steps, so keep starting new passes until the budget is met.
        while global_step < config.max_train_steps:
            if hasattr(train_dataset, "set_epoch"):
                # Lets the shuffled stream use a different order each pass.
                train_dataset.set_epoch(epoch)
            steps_at_epoch_start = global_step

            for batch in train_dataloader:
                loss = get_prediction(batch, log_to=train_logs)
                loss_value = loss.detach().item()
                t_writer.add_scalar("train/loss", loss_value, global_step)
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(unet.parameters(), 2.0)
                t_writer.add_scalar("train/grad_norm", grad_norm.detach().item(), global_step)
                optimizer.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                progress_bar.set_postfix({"loss": f"{loss_value:.4f}"})
                global_step += 1

                if global_step % 100 == 0:
                    plot_logs(train_logs)
                    t_writer.add_figure("train_loss", plt.gcf(), global_step)
                    plt.close()

                # Save exactly once per step, even when the final step is also
                # a regular checkpointing step.
                finished = global_step >= config.max_train_steps
                if finished or global_step % config.checkpointing_steps == 0:
                    save_checkpoint(global_step)
                if finished:
                    print("\n✅ Training complete!")
                    return

            if global_step == steps_at_epoch_start:
                # Without this guard an empty dataset would loop forever.
                raise RuntimeError(
                    f"Dataset {config.dataset_name!r} yielded no batches; cannot train."
                )
            epoch += 1
    finally:
        # Flush pending TensorBoard events and release the progress bar on
        # every exit path (completion, error or interrupt).
        progress_bar.close()
        t_writer.close()
if __name__ == "__main__":
    # Script entry point: train with the default configuration.
    train(TrainConfig())