Upload folder using huggingface_hub
#1
by Bangchis - opened
- config.yaml +97 -0
- diffusion.py +429 -0
- pytorch_model.bin +3 -0
- requirements.txt +24 -0
- unet.py +245 -0
config.yaml
ADDED
@@ -0,0 +1,97 @@
+project: diffusion-from-scratch
+run_name: mnist32_small
+
+data:
+  dataset: mnist
+  image_size: 32          # resize MNIST 28 -> 32 (divisible by the UNet downsample factor)
+  channels: 1
+  batch_size: 128
+  num_workers: 4
+
+opt:
+  lr: 0.0002
+  betas: [0.9, 0.999]
+  grad_clip: 1.0
+
+diffusion:
+  T: 400                  # fewer steps suffice for MNIST
+  beta_schedule: cosine
+  objective: pred_noise   # start simple; later try pred_v
+  sampling_steps: 400     # == T => DDPM; set < T for fast DDIM sampling
+  eta: 0.0
+  self_condition: false
+  clamp_x0: true
+  sample_every: 1000
+  sample_n: 64
+  learned_variance: false
+  var_loss_weight: 0.0
+  min_snr_loss_weight: false
+
+model:
+  dim: 32                 # lightweight UNet
+  dim_mults: [1, 2, 4]    # shallow for MNIST
+  channels: 1
+  attn_heads: 2
+  attn_dim_head: 16
+  dropout: 0.0
+  self_condition: false
+  learned_variance: false
+  outer_attn: false       # disable outer attention; keep only bottleneck attention
+
+train:
+  max_steps: 30000
+  log_every: 200
+  ckpt_dir: ./checkpoints
+  grad_accum: 1
+
+ema:
+  enabled: false
+  decay: 0.995
+  update_every: 10
+
+wandb:
+  enabled: true
+  mode: online
+  api_key_env: WANDB_API_KEY   # name of the env var that holds the key; never commit a raw key
+  tags: [mnist, small, bottleneck-attn]
+
+compute:
+  enable_tf32: true
+
+metrics:
+  # norms
+  global_norm_every: 1000
+
+  # FID / IS (optional; requires clean-fid and torch-fidelity)
+  enable_fid: true
+  enable_is: true
+  fid_every: 4000
+  is_every: 4000
+  metric_num_gen: 5000
+  metric_batch_size: 32
+
+viz:
+  enable_reverse_traj: true
+  reverse_every_steps: 4000    # log trajectory videos sparsely to keep runs light
+  reverse_record_every: 5      # lower => more snapshots recorded (1 = smoothest)
+  reverse_batch_n: 16
+  enable_forward_traj: true
+  forward_every_steps: 4000
+  forward_t_values: [0, 20, 40, 60, 80, 120, 160, 240, 320, 399]   # slightly denser at small t
+  forward_batch_n: 16
+  video_fps: 16                # higher FPS (16-24) gives smoother playback
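For reference, a minimal sketch of loading this config with pyyaml (already in requirements.txt); the `load_config` helper name is illustrative and not part of this upload:

# config_demo.py -- minimal sketch; `load_config` is an assumed helper name
import yaml

def load_config(path="config.yaml"):
    with open(path) as f:
        return yaml.safe_load(f)

cfg = load_config()
print(cfg["model"]["dim_mults"])   # [1, 2, 4]
print(cfg["diffusion"]["T"])       # 400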
diffusion.py
ADDED
@@ -0,0 +1,429 @@
+# diffusion.py
+# Core DDPM/DDIM math: computes the schedule coefficients from the beta
+# schedule and implements the four key transforms:
+#   q_sample, predict_start_from_noise, predict_noise_from_start, q_posterior
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def extract(a, t, x_shape):
+    """Gather per-timestep coefficients a[t], reshaped to broadcast over x."""
+    batch_size = t.shape[0]
+    out = a.gather(-1, t)
+    return out.reshape(batch_size, *((1,) * (len(x_shape) - 1)))
+
+
+def cosine_beta_schedule(timesteps, s=0.008):
+    steps = timesteps + 1
+    t = torch.linspace(0, timesteps, steps, dtype=torch.float32) / timesteps
+    alphas_cumprod = torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** 2
+    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+    betas = 1.0 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+    return torch.clip(betas, 0, 0.999)
+
+
+class GaussianDiffusion(nn.Module):
+    """
+    Core diffusion module that wraps a denoiser (UNet):
+      - precomputes the diffusion constants (betas, alphas, etc.)
+      - provides the training loss (forward): draw a random t, add noise, regress the target
+      - provides the sampling loops (DDPM or DDIM)
+
+    The denoiser must implement forward(x, t, [x_self_cond]) and return the
+    predicted target (epsilon, x0, or v, depending on `objective`).
+    """
+
+    def __init__(self, model, *, image_size, timesteps=400, beta_schedule='cosine',
+                 objective='pred_noise', sampling_steps=None, eta=0.0,
+                 self_condition=False, auto_normalize=True, clamp_x0=True):
+        """
+        Args:
+            model (nn.Module): denoiser network (e.g., UNet).
+            image_size (int or (h, w)): training/sampling resolution (must match the UNet).
+            timesteps (int): T. A small value (e.g., 400) is enough for MNIST.
+            beta_schedule (str): only 'cosine' is implemented here, for simplicity.
+            objective (str): 'pred_noise' | 'pred_x0' | 'pred_v' (training target).
+            sampling_steps (int or None): if set and < T => DDIM sampling with S steps; else full-T DDPM.
+            eta (float): DDIM stochasticity (0.0 => deterministic).
+            self_condition (bool): optional self-conditioning flag.
+            auto_normalize (bool): map inputs [0,1] <-> [-1,1] inside the module.
+            clamp_x0 (bool): clamp the predicted x0 to [-1,1] during sampling, for stability.
+        """
+        super().__init__()
+        self.model = model
+        param = next(model.parameters())
+        param_dtype = param.dtype
+        param_device = param.device
+        self.channels = model.channels
+        self.self_condition = self_condition
+        self.objective = objective
+        self.clamp_x0 = clamp_x0
+
+        # In-module normalization helpers (kept simple & explicit)
+        self.normalize = (lambda x: x * 2 - 1) if auto_normalize else (lambda x: x)
+        self.unnormalize = (lambda x: (x + 1) * 0.5) if auto_normalize else (lambda x: x)
+
+        # Normalize image_size to (H, W)
+        if isinstance(image_size, int):
+            image_size = (image_size, image_size)
+        self.image_size = image_size
+
+        # --- schedule setup ---
+        if beta_schedule != 'cosine':
+            raise NotImplementedError("For the small MNIST setup, keep beta_schedule='cosine'")
+        betas = cosine_beta_schedule(timesteps).to(device=param_device, dtype=param_dtype)  # shape [T]
+
+        alphas = 1.0 - betas                                                  # alpha_t
+        alphas_cumprod = torch.cumprod(alphas, dim=0)                         # alpha_bar_t
+        alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0)   # alpha_bar_{t-1}
+
+        # Timesteps used in training and sampling
+        self.num_timesteps = int(betas.shape[0])
+        self.sampling_steps = int(sampling_steps) if sampling_steps else self.num_timesteps
+        self.is_ddim_sampling = self.sampling_steps < self.num_timesteps
+        self.ddim_sampling_eta = float(eta)
+
+        # Register constants as buffers (moved by .to(device), saved in the state_dict)
+        self.register_buffer('betas', betas)
+        self.register_buffer('alphas_cumprod', alphas_cumprod)
+        self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)
+        self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1.0 - alphas_cumprod))
+        self.register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1.0 / alphas_cumprod))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1.0 / alphas_cumprod - 1.0))
+
+        # Posterior q(x_{t-1} | x_t, x_0) parameters
+        posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
+        self.register_buffer('posterior_variance', posterior_variance)
+        self.register_buffer('posterior_log_variance_clipped',
+                             torch.log(posterior_variance.clamp(min=1e-20)))
+        self.register_buffer('posterior_mean_coef1',
+                             betas * torch.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod))
+        self.register_buffer('posterior_mean_coef2',
+                             (1.0 - alphas_cumprod_prev) * torch.sqrt(1.0 - betas) / (1.0 - alphas_cumprod))
+
+        # Optional loss re-weighting by SNR (kept simple here)
+        snr = alphas_cumprod / (1 - alphas_cumprod)
+        if objective == 'pred_noise':
+            loss_weight = torch.ones_like(snr)   # uniform weight
+        elif objective == 'pred_x0':
+            loss_weight = snr
+        else:  # pred_v
+            loss_weight = snr / (snr + 1)
+        self.register_buffer('loss_weight', loss_weight)
+
+    @property
+    def device(self):
+        """Convenience: the device where the buffers live."""
+        return self.betas.device
+
+    # ----------------------
+    # Forward diffusion (q)
+    # ----------------------
+    def q_sample(self, x0, t, noise=None):
+        """
+        Sample x_t from q(x_t | x_0):
+            x_t = sqrt(alpha_bar_t) * x0 + sqrt(1 - alpha_bar_t) * noise
+        """
+        if noise is None:
+            noise = torch.randn_like(x0)
+        return extract(self.sqrt_alphas_cumprod, t, x0.shape) * x0 + \
+            extract(self.sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise
+
+    # ---------------------------------
+    # Converters between parameterizations
+    # ---------------------------------
+    def predict_start_from_noise(self, x_t, t, eps):
+        """Given an epsilon prediction, reconstruct x0."""
+        return extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - \
+            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+
+    def predict_noise_from_start(self, x_t, t, x0):
+        """Given an x0 prediction, reconstruct epsilon."""
+        return (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \
+            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+
+    def predict_v(self, x0, t, eps):
+        """v-parameterization: v = sqrt(alpha_bar)*eps - sqrt(1-alpha_bar)*x0."""
+        return extract(self.alphas_cumprod.sqrt(), t, x0.shape) * eps - \
+            extract((1.0 - self.alphas_cumprod).sqrt(), t, x0.shape) * x0
+
+    def predict_start_from_v(self, x_t, t, v):
+        """Given a v prediction, reconstruct x0."""
+        return extract(self.alphas_cumprod.sqrt(), t, x_t.shape) * x_t - \
+            extract((1.0 - self.alphas_cumprod).sqrt(), t, x_t.shape) * v
+
+    # ---------------------------------
+    # Model predictions at time t
+    # ---------------------------------
+    def model_predictions(self, x, t, x_self_cond=None, clip_x_start=False, rederive_pred_noise=False):
+        """
+        Run the denoiser and return (pred_noise, x0):
+          - objective == pred_noise: the UNet predicts epsilon directly.
+          - objective == pred_x0:    the UNet predicts x0 directly.
+          - objective == pred_v:     the UNet predicts v; we convert to x0 & epsilon.
+
+        Args:
+            x (Tensor): noised image x_t.
+            t (LongTensor): time indices.
+            x_self_cond (Tensor | None): optional self-conditioning input.
+            clip_x_start (bool): clamp x0 to [-1,1] after prediction.
+            rederive_pred_noise (bool): if True, recompute epsilon from the clamped x0.
+
+        Returns:
+            (pred_noise, x0), both shaped like x.
+        """
+        out = self.model(x, t, x_self_cond) if x_self_cond is not None else self.model(x, t)
+
+        maybe_clip = (lambda z: z.clamp(-1, 1)) if clip_x_start else (lambda z: z)
+
+        if self.objective == 'pred_noise':
+            pred_noise = out
+            x0 = self.predict_start_from_noise(x, t, pred_noise)
+            x0 = maybe_clip(x0)
+            if clip_x_start and rederive_pred_noise:
+                pred_noise = self.predict_noise_from_start(x, t, x0)
+
+        elif self.objective == 'pred_x0':
+            x0 = maybe_clip(out)
+            pred_noise = self.predict_noise_from_start(x, t, x0)
+
+        else:  # 'pred_v'
+            v = out
+            x0 = self.predict_start_from_v(x, t, v)
+            x0 = maybe_clip(x0)
+            pred_noise = self.predict_noise_from_start(x, t, x0)
+
+        return pred_noise, x0
+
+    def q_posterior(self, x0, x_t, t):
+        """
+        Compute the Gaussian q(x_{t-1} | x_t, x0) parameters:
+            mean = c1 * x0 + c2 * x_t
+            var, log_var: closed form from the betas and alpha_bars.
+        """
+        mean = extract(self.posterior_mean_coef1, t, x_t.shape) * x0 + \
+            extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        var = extract(self.posterior_variance, t, x_t.shape)
+        log_var = extract(self.posterior_log_variance_clipped, t, x_t.shape)
+        return mean, var, log_var
+
+    # ----------------------
+    # Training loss (forward)
+    # ----------------------
+    def p_losses(self, x_start, t, noise=None):
+        """
+        DDPM training objective:
+          - sample x_t ~ q(x_t | x_0)
+          - predict the target chosen by `objective` and take the MSE
+          - (optional) self-conditioning, applied with probability 0.5
+        """
+        noise = torch.randn_like(x_start) if noise is None else noise
+        x = self.q_sample(x_start, t, noise)
+
+        x_self_cond = None
+        if self.self_condition and torch.rand(1, device=self.device) < 0.5:
+            # simple self-conditioning: predict x0 once and feed it back
+            with torch.no_grad():
+                _, x_self_cond = self.model_predictions(x, t, None, clip_x_start=True)
+
+        model_out = self.model(x, t, x_self_cond) if x_self_cond is not None else self.model(x, t)
+
+        if self.objective == 'pred_noise':
+            target = noise
+        elif self.objective == 'pred_x0':
+            target = x_start
+        else:  # pred_v
+            target = self.predict_v(x_start, t, noise)
+
+        # MSE over channel/spatial dims -> mean over the batch
+        loss = F.mse_loss(model_out, target, reduction='none')
+        loss = loss.mean(dim=list(range(1, loss.ndim)))  # average over C, H, W
+        # SNR-based weight (== 1 for pred_noise)
+        loss = loss * extract(self.loss_weight, t, loss.shape)
+        return loss.mean()
+
+    def forward(self, img):
+        """
+        Training entry point:
+          - normalize to [-1,1]
+          - draw random timesteps
+          - compute the loss
+        """
+        img = img.to(device=self.device, dtype=next(self.model.parameters()).dtype)
+        b, c, h, w = img.shape
+        assert (h, w) == self.image_size, f"image must be {self.image_size}, got {(h, w)}"
+        t = torch.randint(0, self.num_timesteps, (b,), device=img.device).long()
+        img = self.normalize(img)
+        return self.p_losses(img, t)
+
+    # ----------------------
+    # Single DDPM step p(x_{t-1} | x_t)
+    # ----------------------
+    @torch.inference_mode()
+    def p_sample(self, x, t: int, x_self_cond=None):
+        """
+        One reverse step:
+          - predict (epsilon, x0), compute the posterior q(x_{t-1} | x_t, x0)
+          - sample from that Gaussian (adding noise except at t=0)
+        """
+        b = x.shape[0]
+        tt = torch.full((b,), t, device=self.device, dtype=torch.long)
+        pred_noise, x0 = self.model_predictions(x, tt, x_self_cond, clip_x_start=True)
+        mean, _, log_var = self.q_posterior(x0, x, tt)
+        noise = torch.randn_like(x) if t > 0 else 0.0
+        return mean + (0.5 * log_var).exp() * noise, x0
+
+    # ----------------------
+    # Sampling loops
+    # ----------------------
+    @torch.inference_mode()
+    def ddpm_sample(self, shape):
+        """DDPM sampling with the full T steps (slow, high quality)."""
+        img = torch.randn(shape, device=self.device)
+        x0 = None
+        for t in reversed(range(self.num_timesteps)):
+            self_cond = x0 if self.self_condition else None
+            img, x0 = self.p_sample(img, t, self_cond)
+        return self.unnormalize(img)
+
+    @torch.inference_mode()
+    def ddim_sample(self, shape):
+        """
+        DDIM sampling with S < T steps (fast, often near-DDPM quality).
+        Deterministic when eta = 0.0.
+        """
+        T, S, eta = self.num_timesteps, self.sampling_steps, self.ddim_sampling_eta
+        # decreasing time-index schedule of length S+1: [T-1, ..., 0, -1]
+        times = torch.linspace(-1, T - 1, steps=S + 1, device=self.device).long().flip(0)
+        pairs = list(zip(times[:-1].tolist(), times[1:].tolist()))
+
+        img = torch.randn(shape, device=self.device)
+        x0 = None
+
+        for t, t_next in pairs:
+            tt = torch.full((shape[0],), t, device=self.device, dtype=torch.long)
+            pred_noise, x0 = self.model_predictions(
+                img, tt, None, clip_x_start=True, rederive_pred_noise=True)
+
+            if t_next < 0:
+                # final step: set directly to the predicted x0
+                img = x0
+                continue
+
+            a_t, a_next = self.alphas_cumprod[t], self.alphas_cumprod[t_next]
+            sigma = eta * ((1 - a_t / a_next) * (1 - a_next) / (1 - a_t)).sqrt()
+            c = (1 - a_next - sigma ** 2).sqrt()
+            noise = torch.randn_like(img)
+
+            # DDIM update rule
+            img = x0 * a_next.sqrt() + c * pred_noise + sigma * noise
+
+        return self.unnormalize(img)
+
+    @torch.inference_mode()
+    def sample(self, batch_size=16):
+        """
+        Public sampling API:
+          - chooses DDPM or DDIM depending on `sampling_steps`
+          - returns a batch of images in [0,1]
+        """
+        H, W = self.image_size
+        fn = self.ddim_sample if self.is_ddim_sampling else self.ddpm_sample
+        return fn((batch_size, self.channels, H, W))
+
+    # ----------------------
+    # DDPM sampling with trajectory recording, and forward-noising visualization
+    # ----------------------
+    @torch.inference_mode()
+    def ddpm_sample_trajectory(self, shape, record_every=50, return_x0=False):
+        """
+        DDPM sampling that also records intermediate frames.
+          - record_every: save a snapshot every N steps (first/last always included).
+          - return_x0: if True, also store the predicted x0 at the same checkpoints.
+
+        Returns:
+            final_img [B,C,H,W] in [0,1],
+            frames_xt: list of tensors in [0,1], each [B,C,H,W],
+            frames_x0 (or None): same length as frames_xt if return_x0=True.
+        """
+        img = torch.randn(shape, device=self.device)
+        frames_xt = []
+        frames_x0 = [] if return_x0 else None
+
+        x0 = None
+        T = self.num_timesteps
+
+        for t in reversed(range(T)):
+            # record the current x_t before stepping
+            if t == T - 1 or t == 0 or (t % record_every) == 0:
+                # unnormalize for visualization (to [0,1])
+                frames_xt.append(self.unnormalize(img.clamp(-1, 1)))
+                if return_x0 and x0 is not None:
+                    frames_x0.append(self.unnormalize(x0.clamp(-1, 1)))
+
+            self_cond = x0 if self.self_condition else None
+            img, x0 = self.p_sample(img, t, self_cond)
+
+        # record the final image
+        frames_xt.append(self.unnormalize(img.clamp(-1, 1)))
+        if return_x0:
+            frames_x0.append(self.unnormalize(x0.clamp(-1, 1)))
+
+        return self.unnormalize(img), frames_xt, frames_x0
+
+    @torch.no_grad()
+    def forward_noising_trajectory(self, x0, t_values):
+        """
+        Visualize the forward diffusion q(x_t | x_0) at selected t.
+        Args:
+            x0: clean images in [0,1], [B,C,H,W]
+            t_values: list/iterable of ints in 0..T-1
+
+        Returns:
+            frames_xt: list of tensors in [0,1], each [B,C,H,W]
+        """
+        # normalize exactly like the training path
+        x0n = self.normalize(x0.to(self.device))
+        frames = []
+        for t in t_values:
+            tt = torch.full((x0n.size(0),), int(t), device=self.device, dtype=torch.long)
+            xt = self.q_sample(x0n, tt)  # in the [-1,1] domain
+            # map back to [0,1] for viewing
+            frames.append(self.unnormalize(xt))
+        return frames
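A minimal end-to-end sketch of how the two modules compose, assuming `unet.py` and `diffusion.py` are on the import path as uploaded; the optimizer settings mirror config.yaml, but the loop itself is illustrative rather than the repo's training script:

# train_sketch.py -- illustrative wiring of UNet + GaussianDiffusion
import torch
from unet import UNet
from diffusion import GaussianDiffusion

model = UNet(dim=32, dim_mults=(1, 2, 4), channels=1, outer_attn=False)
diffusion = GaussianDiffusion(model, image_size=32, timesteps=400,
                              objective='pred_noise', sampling_steps=400)

opt = torch.optim.Adam(diffusion.parameters(), lr=2e-4, betas=(0.9, 0.999))

batch = torch.rand(8, 1, 32, 32)   # stand-in for MNIST images in [0, 1]
loss = diffusion(batch)            # forward() normalizes, draws t, returns the loss
loss.backward()
torch.nn.utils.clip_grad_norm_(diffusion.parameters(), 1.0)
opt.step()

samples = diffusion.sample(batch_size=16)   # [16, 1, 32, 32] in [0, 1]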
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b434bb9f31f1b7204aa76c4b93881e896f3b0280233d4237518357cb71cd14d5
+size 33190930
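The weights are stored through Git LFS (the pointer above resolves to a ~33 MB binary). A hedged sketch of pulling and loading them with huggingface_hub; the repo id is a placeholder, and whether the file holds a bare state_dict or a wrapped training checkpoint is an assumption to verify:

# load_sketch.py -- illustrative; repo_id is a placeholder, and the checkpoint's
# key layout (bare state_dict vs. wrapped dict) must be verified before use.
import torch
from huggingface_hub import hf_hub_download
from unet import UNet
from diffusion import GaussianDiffusion

path = hf_hub_download(repo_id="<user>/<repo>", filename="pytorch_model.bin")
state = torch.load(path, map_location="cpu")

model = UNet(dim=32, dim_mults=(1, 2, 4), channels=1, outer_attn=False)
diffusion = GaussianDiffusion(model, image_size=32, timesteps=400)
diffusion.load_state_dict(state)   # assumes the bin holds the diffusion state_dict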
requirements.txt
ADDED
@@ -0,0 +1,24 @@
+# Use PyTorch CUDA 12.1 wheels for torch/torchvision
+--index-url https://download.pytorch.org/whl/cu121
+torch==2.3.1
+torchvision==0.18.1
+
+# Core utils
+pyyaml>=6.0.1
+tqdm>=4.66.0
+numpy>=1.26.0
+Pillow>=10.0.0
+einops>=0.7.0
+
+# Logging & videos
+wandb>=0.16.0
+imageio>=2.31.0
+imageio-ffmpeg>=0.4.9  # writes MP4 without a system ffmpeg install
+
+# Metrics (if FID/IS are enabled)
+clean-fid>=0.1.35
+torch-fidelity>=0.3.0
+
+# (Optional) Push the model to Hugging Face
+huggingface_hub>=0.23.0
unet.py
ADDED
@@ -0,0 +1,245 @@
+# unet.py
+# Lightweight UNet for MNIST:
+#   - optional outer attention disabled (nn.Identity)
+#   - full attention only at the bottleneck
+
+import math
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+from einops import rearrange
+
+
+def divisible_by(x, y):
+    return x % y == 0
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.weight = nn.Parameter(torch.ones(1, dim, 1, 1))
+
+    def forward(self, x):
+        return F.normalize(x, dim=1) * self.weight * self.scale
+
+
+class SinusoidalPosEmb(nn.Module):
+    def __init__(self, dim, theta=10000):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+
+    def forward(self, t):
+        device = t.device
+        half = self.dim // 2
+        freqs = torch.exp(torch.arange(half, device=device)
+                          * -(math.log(self.theta) / (half - 1)))
+        args = t.float()[:, None] * freqs[None, :]
+        return torch.cat([args.sin(), args.cos()], dim=-1)
+
+
+class Block(nn.Module):
+    def __init__(self, dim, dim_out, dropout=0.):
+        super().__init__()
+        self.project = nn.Conv2d(dim, dim_out, kernel_size=3, padding=1)
+        self.norm = RMSNorm(dim_out)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = nn.SiLU()
+
+    def forward(self, x, scale_shift=None):
+        x = self.project(x)
+        x = self.norm(x)
+        if scale_shift is not None:
+            scale, shift = scale_shift
+            x = x * (scale + 1) + shift
+        x = self.dropout(self.activation(x))
+        return x
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, dim, dim_out, *, time_emb_dim=None, dropout=0.):
+        super().__init__()
+        self.mlp = nn.Sequential(nn.SiLU(), nn.Linear(
+            time_emb_dim, dim_out * 2)) if time_emb_dim else None
+        self.b1 = Block(dim, dim_out, dropout=dropout)
+        self.b2 = Block(dim_out, dim_out, dropout=0.)
+        self.skip = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
+
+    def forward(self, x, t=None):
+        scale_shift = None
+        if self.mlp is not None and t is not None:
+            emb = self.mlp(t).view(t.size(0), -1, 1, 1)
+            scale_shift = emb.chunk(2, dim=1)
+        h = self.b1(x, scale_shift)
+        h = self.b2(h)
+        return h + self.skip(x)
+
+
+class LinearAttention(nn.Module):
+    def __init__(self, dim, heads=2, dim_head=16):
+        super().__init__()
+        self.heads = heads
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Conv2d(dim, dim_head * heads * 3, 1, bias=False)
+        self.to_out = nn.Sequential(
+            nn.Conv2d(dim_head * heads, dim, 1), RMSNorm(dim))
+        self.scale = dim_head ** -0.5
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        res = x  # residual applied inside, so nn.Identity is a clean drop-in at call sites
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x).chunk(3, dim=1)
+        q = rearrange(q, 'b (h d) x y -> b h d (x y)', h=self.heads)
+        k = rearrange(k, 'b (h d) x y -> b h d (x y)', h=self.heads)
+        v = rearrange(v, 'b (h d) x y -> b h d (x y)', h=self.heads)
+        q = torch.softmax(q, dim=-2) * self.scale
+        k = torch.softmax(k, dim=-1)
+        ctx = torch.einsum('b h d n, b h e n -> b h d e', k, v)
+        out = torch.einsum('b h d e, b h d n -> b h e n', ctx, q)
+        out = rearrange(out, 'b h c (x y) -> b (h c) x y', x=h, y=w)
+        return self.to_out(out) + res
+
+
+class FullAttention(nn.Module):
+    def __init__(self, dim, heads=2, dim_head=16):
+        super().__init__()
+        self.heads = heads
+        inner = heads * dim_head
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Conv2d(dim, inner * 3, 1, bias=False)
+        self.to_out = nn.Conv2d(inner, dim, 1)
+        self.scale = dim_head ** -0.5
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        res = x  # pre-norm residual
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x).chunk(3, dim=1)
+        q = rearrange(q, 'b (h d) x y -> b h (x y) d', h=self.heads)
+        k = rearrange(k, 'b (h d) x y -> b h (x y) d', h=self.heads)
+        v = rearrange(v, 'b (h d) x y -> b h (x y) d', h=self.heads)
+        attn = torch.softmax((q @ k.transpose(-1, -2)) * self.scale, dim=-1)
+        out = attn @ v
+        out = rearrange(out, 'b h (x y) d -> b (h d) x y', x=h, y=w)
+        return self.to_out(out) + res
+
+
+class UNet(nn.Module):
+    """
+    Minimal UNet for MNIST 32x32.
+      - outer_attn=False -> nn.Identity in the outer levels
+      - FullAttention at the bottleneck only
+    Attention modules apply their residual internally, so an nn.Identity
+    stand-in passes features through unchanged (no accidental doubling).
+    """
+
+    def __init__(self, dim=32, init_dim=None, out_dim=None, dim_mults=(1, 2, 4),
+                 channels=1, dropout=0.0, attn_heads=2, attn_dim_head=16,
+                 self_condition=False, learned_variance=False, outer_attn=False):
+        super().__init__()
+        self.channels = channels
+        self.self_condition = self_condition
+        self.learned_variance = learned_variance
+
+        in_ch = channels * (2 if self_condition else 1)
+        init_dim = init_dim or dim
+        self.init_conv = nn.Conv2d(in_ch, init_dim, 7, padding=3)
+
+        dims = [init_dim, *[dim * m for m in dim_mults]]
+        in_out = list(zip(dims[:-1], dims[1:]))
+
+        time_dim = dim * 4
+        self.time_mlp = nn.Sequential(
+            SinusoidalPosEmb(dim),
+            nn.Linear(dim, time_dim),
+            nn.GELU(),
+            nn.Linear(time_dim, time_dim)
+        )
+
+        self.downs = nn.ModuleList([])
+        self.ups = nn.ModuleList([])
+
+        for i, (d_in, d_out) in enumerate(in_out):
+            is_last = i == (len(in_out) - 1)
+            attn_mod = LinearAttention(
+                d_in, heads=attn_heads, dim_head=attn_dim_head) if outer_attn else nn.Identity()
+            self.downs.append(nn.ModuleList([
+                ResnetBlock(d_in, d_in, time_emb_dim=time_dim, dropout=dropout),
+                ResnetBlock(d_in, d_in, time_emb_dim=time_dim, dropout=dropout),
+                attn_mod,
+                (nn.Conv2d(d_in, d_out, 3, padding=1) if is_last else
+                 nn.Sequential(nn.Conv2d(d_in, d_in, 4, stride=2, padding=1),
+                               nn.Conv2d(d_in, d_out, 3, padding=1)))
+            ]))
+
+        mid_dim = dims[-1]
+        self.mid_block1 = ResnetBlock(
+            mid_dim, mid_dim, time_emb_dim=time_dim, dropout=dropout)
+        self.mid_attn = FullAttention(
+            mid_dim, heads=attn_heads, dim_head=attn_dim_head)  # bottleneck
+        self.mid_block2 = ResnetBlock(
+            mid_dim, mid_dim, time_emb_dim=time_dim, dropout=dropout)
+
+        for i, (d_in, d_out) in enumerate(reversed(in_out)):
+            is_last = i == (len(in_out) - 1)
+            attn_mod_up = LinearAttention(
+                d_out, heads=attn_heads, dim_head=attn_dim_head) if outer_attn else nn.Identity()
+            self.ups.append(nn.ModuleList([
+                ResnetBlock(d_out + d_in, d_out, time_emb_dim=time_dim, dropout=dropout),
+                ResnetBlock(d_out + d_in, d_out, time_emb_dim=time_dim, dropout=dropout),
+                attn_mod_up,
+                (nn.Conv2d(d_out, d_in, 3, padding=1) if is_last else
+                 nn.Sequential(nn.ConvTranspose2d(d_out, d_out, 4, stride=2, padding=1),
+                               nn.Conv2d(d_out, d_in, 3, padding=1)))
+            ]))
+
+        self.out_dim = out_dim or channels  # learned_variance=False for MNIST
+        self.final_res_block = ResnetBlock(
+            init_dim * 2, init_dim, time_emb_dim=time_dim, dropout=dropout)
+        self.final_conv = nn.Conv2d(init_dim, self.out_dim, 1)
+
+    @property
+    def downsample_factor(self):
+        return 2 ** (len(self.downs) - 1)  # len=3 -> 4
+
+    def forward(self, x, time, x_self_cond=None):
+        assert all(divisible_by(d, self.downsample_factor) for d in x.shape[-2:])
+        if self.self_condition:
+            if x_self_cond is None:
+                x_self_cond = torch.zeros_like(x)
+            x = torch.cat([x_self_cond, x], dim=1)
+
+        x = self.init_conv(x)
+        r = x.clone()
+        t = self.time_mlp(time)
+
+        hs = []
+        for b1, b2, attn, down in self.downs:
+            x = b1(x, t)
+            hs.append(x)
+            x = b2(x, t)
+            x = attn(x)  # residual lives inside the attention module; Identity is a no-op
+            hs.append(x)
+            x = down(x)
+
+        x = self.mid_block1(x, t)
+        x = self.mid_attn(x)
+        x = self.mid_block2(x, t)
+
+        for b1, b2, attn, up in self.ups:
+            x = torch.cat([x, hs.pop()], dim=1)
+            x = b1(x, t)
+            x = torch.cat([x, hs.pop()], dim=1)
+            x = b2(x, t)
+            x = attn(x)
+            x = up(x)
+
+        x = torch.cat([x, r], dim=1)
+        x = self.final_res_block(x, t)
+        return self.final_conv(x)
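A quick smoke test for the network above; with dim_mults=(1, 2, 4) the downsample factor is 4, so a 32x32 input passes the divisibility assert and the output matches the input shape:

# shape_check.py -- smoke test for UNet on MNIST-sized input
import torch
from unet import UNet

net = UNet(dim=32, dim_mults=(1, 2, 4), channels=1, outer_attn=False)
x = torch.randn(4, 1, 32, 32)        # batch of noised images
t = torch.randint(0, 400, (4,))      # timestep indices
out = net(x, t)
assert out.shape == (4, 1, 32, 32)   # epsilon prediction, same shape as input
print(out.shape)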