File size: 7,428 Bytes

54c8086

"""
U-Net architecture for conditional diffusion on spatiotemporal PDE data.
Supports non-square inputs, time conditioning, and skip connections.
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class SinusoidalPosEmb(nn.Module):
    """Sinusoidal positional embedding for diffusion timestep.

    Maps a batch of timesteps of shape ``[B]`` to features of shape
    ``[B, dim]``: ``dim // 2`` sines followed by ``dim // 2`` cosines, using
    frequencies spaced geometrically from 1 down to 1/10000.  When ``dim`` is
    odd, one trailing zero column is appended so the output width is always
    exactly ``dim``.

    Args:
        dim: width of the returned embedding.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        """Embed timesteps.

        Args:
            t: 1-D tensor of timesteps ``[B]`` (any numeric dtype; cast to
                float internally).
        Returns:
            Tensor ``[B, dim]`` on the same device as ``t``.
        """
        half = self.dim // 2
        # With dim in {2, 3} there is a single frequency (half == 1); clamp
        # the denominator so the log-spacing step does not divide by zero.
        step = math.log(10000) / max(half - 1, 1)
        freqs = torch.exp(torch.arange(half, device=t.device) * -step)
        angles = t[:, None].float() * freqs[None, :]
        emb = torch.cat([angles.sin(), angles.cos()], dim=-1)
        if self.dim % 2 == 1:
            # sin/cos halves only cover 2 * half columns; pad the last one so
            # downstream layers sized for ``dim`` inputs still match.
            emb = torch.cat([emb, emb.new_zeros(emb.shape[0], 1)], dim=-1)
        return emb


class ResBlock(nn.Module):
    """Residual block with group norm, SiLU, and time embedding injection.

    Layout: ``norm -> SiLU -> conv3x3 -> (+ time projection) -> norm -> SiLU
    -> dropout -> conv3x3``, added to a skip path that is a 1x1 convolution
    when the channel count changes and the identity otherwise.

    Args:
        in_ch: input channels.
        out_ch: output channels.
        time_dim: width of the timestep embedding passed to ``forward``.
        dropout: dropout probability applied before the second convolution.
    """

    def __init__(self, in_ch, out_ch, time_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.GroupNorm(self._num_groups(in_ch), in_ch)
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
        self.time_mlp = nn.Sequential(nn.SiLU(), nn.Linear(time_dim, out_ch))
        self.norm2 = nn.GroupNorm(self._num_groups(out_ch), out_ch)
        self.dropout = nn.Dropout(dropout)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
        self.skip = nn.Conv2d(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity()

    @staticmethod
    def _num_groups(channels, max_groups=32):
        """Return the largest group count <= ``max_groups`` dividing ``channels``.

        ``nn.GroupNorm`` requires the channel count to be divisible by the
        number of groups, so a plain ``min(32, channels)`` fails for widths
        such as 48 or 80.  For widths that are <= 32 or multiples of 32 this
        returns the same value as ``min(32, channels)``.
        """
        upper = min(max_groups, channels)
        return next((g for g in range(upper, 0, -1) if channels % g == 0), 1)

    def forward(self, x, t_emb):
        """Apply the block.

        Args:
            x: feature map ``[B, in_ch, H, W]``.
            t_emb: timestep embedding ``[B, time_dim]``.
        Returns:
            Feature map ``[B, out_ch, H, W]``.
        """
        h = F.silu(self.norm1(x))
        h = self.conv1(h)
        # Broadcast the per-channel time projection over both spatial axes.
        h = h + self.time_mlp(t_emb)[:, :, None, None]
        h = F.silu(self.norm2(h))
        h = self.dropout(h)
        h = self.conv2(h)
        return h + self.skip(x)


class SelfAttention(nn.Module):
    """Multi-head self-attention on spatial features.

    Every spatial position of the (group-normalised) feature map is treated
    as one token of width ``channels``; the attention output is added back to
    the input as a residual.

    Args:
        channels: feature channels; must be divisible by ``num_heads``
            (requirement of ``nn.MultiheadAttention``).
        num_heads: number of attention heads.
    """

    def __init__(self, channels, num_heads=4):
        super().__init__()
        # GroupNorm needs channels % groups == 0; take the largest divisor
        # <= 32 (equals min(32, channels) whenever that already divides).
        groups = next(
            (g for g in range(min(32, channels), 0, -1) if channels % g == 0), 1
        )
        self.norm = nn.GroupNorm(groups, channels)
        self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)

    def forward(self, x):
        """Attend over all spatial positions.

        Args:
            x: feature map ``[B, C, H, W]``.
        Returns:
            Tensor of the same shape as ``x``.
        """
        B, C, H, W = x.shape
        # [B, C, H, W] -> [B, H*W, C]: one token per spatial location.
        tokens = self.norm(x).reshape(B, C, H * W).permute(0, 2, 1)
        # The attention map is never used; skipping it avoids materialising
        # an extra [B, H*W, H*W] tensor.
        attended, _ = self.attn(tokens, tokens, tokens, need_weights=False)
        attended = attended.permute(0, 2, 1).reshape(B, C, H, W)
        return x + attended


class Downsample(nn.Module):
    """Learned spatial downsampling: a 3x3 convolution with stride 2.

    The channel count is preserved; with padding 1 each spatial size ``n``
    becomes ``(n - 1) // 2 + 1``.
    """

    def __init__(self, ch):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=ch,
            out_channels=ch,
            kernel_size=3,
            stride=2,
            padding=1,
        )

    def forward(self, x):
        """Return ``x`` convolved down to roughly half its height and width."""
        reduced = self.conv(x)
        return reduced


class Upsample(nn.Module):
    """Spatial upsampling: nearest-neighbour x2 followed by a 3x3 convolution.

    The channel count is preserved; height and width are doubled.
    """

    def __init__(self, ch):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=ch,
            out_channels=ch,
            kernel_size=3,
            padding=1,
        )

    def forward(self, x):
        """Return ``x`` enlarged by a factor of two and smoothed by the conv."""
        enlarged = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(enlarged)


class UNet(nn.Module):
    """U-Net for conditional diffusion.

    The condition (e.g. previous frame) is concatenated to the noisy input
    along the channel dimension.  It can either be concatenated by the caller
    before calling ``forward()`` or passed as ``cond`` and concatenated inside
    ``forward()``; in both cases set
    ``in_channels = output_channels + condition_channels``.

    Spatial sizes need not be square nor divisible by
    ``2 ** (len(ch_mults) - 1)``: in the decoder, features are resized to the
    matching skip connection's height/width before concatenation.

    Args:
        in_channels: noisy-target channels + condition channels.
        out_channels: channels to predict (same as target).
        base_ch: base channel width.
        ch_mults: per-level channel multipliers.
        n_res: residual blocks per level.
        attn_levels: which levels get self-attention (0-indexed).
        dropout: dropout rate.
        time_dim: timestep embedding dimension.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        base_ch=64,
        ch_mults=(1, 2, 4, 8),
        n_res=2,
        attn_levels=(3,),
        dropout=0.1,
        time_dim=256,
    ):
        super().__init__()
        self.n_res = n_res
        self.ch_mults = ch_mults

        # --- time embedding ---
        self.time_embed = nn.Sequential(
            SinusoidalPosEmb(time_dim),
            nn.Linear(time_dim, time_dim * 4),
            nn.SiLU(),
            nn.Linear(time_dim * 4, time_dim),
        )

        # --- input projection ---
        self.input_conv = nn.Conv2d(in_channels, base_ch, 3, padding=1)

        # --- downsampling path ---
        self.downs = nn.ModuleList()
        ch = base_ch
        # Channel width of every tensor forward() pushes onto its skip stack,
        # in push order; the decoder pops it in reverse to size its inputs.
        skip_chs = [ch]

        last_lvl = len(ch_mults) - 1
        for lvl, mult in enumerate(ch_mults):
            out_ch = base_ch * mult
            for _ in range(n_res):
                stage = {"res": ResBlock(ch, out_ch, time_dim, dropout)}
                if lvl in attn_levels:
                    stage["attn"] = SelfAttention(out_ch)
                self.downs.append(nn.ModuleDict(stage))
                ch = out_ch
                skip_chs.append(ch)
            if lvl < last_lvl:
                # The downsampled tensor is also kept as a skip.
                self.downs.append(nn.ModuleDict({"down": Downsample(ch)}))
                skip_chs.append(ch)

        # --- middle ---
        self.mid_res1 = ResBlock(ch, ch, time_dim, dropout)
        self.mid_attn = SelfAttention(ch)
        self.mid_res2 = ResBlock(ch, ch, time_dim, dropout)

        # --- upsampling path ---
        self.ups = nn.ModuleList()
        for lvl in reversed(range(len(ch_mults))):
            out_ch = base_ch * ch_mults[lvl]
            # n_res skips from this level's res blocks, +1 for the tensor that
            # entered the level (input projection or downsample output).
            for _ in range(n_res + 1):
                skip_ch = skip_chs.pop()
                stage = {"res": ResBlock(ch + skip_ch, out_ch, time_dim, dropout)}
                if lvl in attn_levels:
                    stage["attn"] = SelfAttention(out_ch)
                self.ups.append(nn.ModuleDict(stage))
                ch = out_ch
            if lvl > 0:
                self.ups.append(nn.ModuleDict({"up": Upsample(ch)}))

        # --- output projection ---
        # GroupNorm needs ch % groups == 0; take the largest divisor <= 32
        # (equals min(32, ch) whenever that already divides ch).
        out_groups = next((g for g in range(min(32, ch), 0, -1) if ch % g == 0), 1)
        self.out_norm = nn.GroupNorm(out_groups, ch)
        self.out_conv = nn.Conv2d(ch, out_channels, 3, padding=1)

    def forward(self, x, t, cond=None):
        """
        Args:
            x: noisy target  [B, C_out, H, W] (or target and condition already
                concatenated, when ``cond`` is None)
            t: diffusion timestep [B] (int or float)
            cond: condition   [B, C_cond, H, W] (optional, concatenated to
                ``x`` along channels)
        Returns:
            network output (e.g. predicted noise)  [B, out_channels, H, W]
        """
        if cond is not None:
            x = torch.cat([x, cond], dim=1)

        t_emb = self.time_embed(t)
        h = self.input_conv(x)

        # --- down ---
        skips = [h]
        for block in self.downs:
            if "down" in block:
                h = block["down"](h)
            else:
                h = block["res"](h, t_emb)
                if "attn" in block:
                    h = block["attn"](h)
            # Every encoder stage (res or downsample) contributes one skip.
            skips.append(h)

        # --- middle ---
        h = self.mid_res1(h, t_emb)
        h = self.mid_attn(h)
        h = self.mid_res2(h, t_emb)

        # --- up ---
        for block in self.ups:
            if "up" in block:
                h = block["up"](h)
                continue
            s = skips.pop()
            if h.shape[-2:] != s.shape[-2:]:
                # Stride-2 downsampling rounds odd sizes up, so doubling can
                # overshoot the encoder resolution; snap back to the skip's
                # size so the concatenation is valid.
                h = F.interpolate(h, size=tuple(s.shape[-2:]), mode="nearest")
            h = torch.cat([h, s], dim=1)
            h = block["res"](h, t_emb)
            if "attn" in block:
                h = block["attn"](h)

        h = F.silu(self.out_norm(h))
        return self.out_conv(h)