Project-Ground-Zero
/

Flashscape-V0

Model card Files Files and versions

xet

Community

Fgdfgfthgr committed on Oct 6, 2025

Commit

110b2ff

verified ·

1 Parent(s): 4936d6c

Delete network_diffusion_unet.py

Browse files

Files changed (1) hide show

network_diffusion_unet.py +0 -389

network_diffusion_unet.py DELETED Viewed

@@ -1,389 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-from torch.utils.checkpoint import checkpoint
-class SinusoidalEmbedding(nn.Module):
-    def __init__(self, embedding_dim=128, base=1000, scaling=1000):
-        super().__init__()
-        self.embedding_dim = embedding_dim
-        half_dim = embedding_dim // 2
-        freqs = torch.exp(-math.log(base) * torch.arange(0, half_dim) / half_dim)
-        # at base 1000, max-range = +=500pi = -1571 to 1571
-        self.scaling = nn.parameter.Buffer(torch.tensor(scaling))
-        self.freqs = nn.parameter.Buffer(freqs)
-    def forward(self, scaler):
-        scaler = scaler * self.scaling
-        args = scaler[:, None] * self.freqs[None]
-        embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
-        return embedding
-class SinusoidalPositionalEmbedding2D(nn.Module):
-    def __init__(self, embedding_dim):
-        super().__init__()
-        assert embedding_dim % 2 == 0, "embedding_dim must be even"
-        self.embedding_dim = embedding_dim
-        half_dim = self.embedding_dim // 2
-        div_term = torch.exp(torch.arange(0, half_dim, 2) * (-math.log(100.0) / half_dim))
-        # Since our grid size is small, 100 should be enough
-        self.div_term = nn.parameter.Buffer(div_term.to(torch.float32))
-    def forward(self, height, width):
-        """Generate embeddings for a grid of size (height, width)."""
-        # Generate grid coordinates
-        y_pos = torch.arange(height, dtype=torch.float32, device=self.div_term.device)
-        x_pos = torch.arange(width, dtype=torch.float32, device=self.div_term.device)
-        # Compute sinusoidal components for height and width
-        y_sin = torch.sin(y_pos[:, None] * self.div_term[None, :])
-        y_cos = torch.cos(y_pos[:, None] * self.div_term[None, :])
-        x_sin = torch.sin(x_pos[:, None] * self.div_term[None, :])
-        x_cos = torch.cos(x_pos[:, None] * self.div_term[None, :])
-        # Interleave sin and cos components
-        y_embed = torch.stack([y_sin, y_cos], dim=-1).view(height, -1)
-        x_embed = torch.stack([x_sin, x_cos], dim=-1).view(width, -1)
-        # Combine height and width embeddings
-        pos_embed = torch.cat([y_embed[:, None, :].expand(-1, width, -1),
-                               x_embed[None, :, :].expand(height, -1, -1)], dim=-1)
-        return pos_embed.view(height * width, self.embedding_dim)
class ImageLinearAttention(nn.Module):
    """Linear attention over the spatial positions of an image tensor.

    Queries, keys and values come from same-padded convolutions. Keys are
    soft-maxed over the flattened spatial axis, a per-head key/value context
    matrix is formed, and queries read from that context. The convolved
    result is added back onto the input.

    Args:
        chan: Number of input and output channels.
        kernel_size: Kernel size of all four projection convolutions.
        heads: Number of attention heads; each head gets ``chan // heads``
            key and value channels.
        norm_queries: When true, queries are soft-maxed over the per-head
            channel axis.
        embd_dim: Width of an optional conditioning vector. When given, a
            linear layer maps the vector to a per-channel scale and shift
            that modulate the normalised Q, K and V.
    """

    def __init__(self, chan, kernel_size=3, heads=4, norm_queries=True, embd_dim=None):
        super().__init__()
        self.chan = chan
        self.heads = heads
        per_head = chan // heads
        self.key_dim = per_head
        self.value_dim = per_head
        self.norm_queries = norm_queries

        qk_width = self.key_dim * heads
        v_width = self.value_dim * heads
        conv_opts = dict(padding='same', padding_mode='replicate')
        self.to_q = nn.Conv2d(chan, qk_width, kernel_size, **conv_opts)
        self.to_k = nn.Conv2d(chan, qk_width, kernel_size, **conv_opts)
        self.to_v = nn.Conv2d(chan, v_width, kernel_size, **conv_opts)
        self.to_out = nn.Conv2d(v_width, chan, kernel_size, **conv_opts)

        # With conditioning the norm carries no affine parameters of its own,
        # because scale and shift come from the embedding projection instead.
        conditioned = embd_dim is not None
        self.norm = nn.GroupNorm(1, qk_width, affine=not conditioned)
        self.emb_proj = nn.Linear(embd_dim, 2 * qk_width) if conditioned else None

    def forward(self, x, emb=None):
        """Apply attention to ``x`` (unpacked as ``b, c, h, w``) and return ``x + out``.

        NOTE(review): ``self.norm`` is only applied when both ``emb`` and
        ``self.emb_proj`` are present; a layer built without ``embd_dim``
        never uses its affine GroupNorm here. Confirm that is intended.
        """
        b, _, h, w = x.shape
        n_heads, d_k = self.heads, self.key_dim

        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)

        if not (emb is None or self.emb_proj is None):
            modulation = self.emb_proj(emb).view(b, 2, -1)
            # The same scale and shift are shared by Q, K and V.
            gain = 1 + modulation[:, 0][:, :, None, None]
            offset = modulation[:, 1][:, :, None, None]
            q, k, v = (self.norm(t) * gain + offset for t in (q, k, v))

        n_tokens = h * w
        q = q.view(b, n_heads, d_k, n_tokens)
        k = k.view(b, n_heads, d_k, n_tokens)
        v = v.view(b, n_heads, self.value_dim, n_tokens)

        # Split the usual 1/sqrt(d) factor evenly between queries and keys.
        temperature = d_k ** -0.25
        q = q * temperature
        k = (k * temperature).softmax(dim=-1)
        if self.norm_queries:
            q = q.softmax(dim=-2)

        context = torch.einsum('bhdn,bhen->bhde', k, v)
        attended = torch.einsum('bhdn,bhde->bhen', q, context)
        return x + self.to_out(attended.reshape(b, -1, h, w))
class ResConvBlock(nn.Module):
    """Residual block of two 3x3 convolutions, conditioned on an embedding.

    The embedding is projected to a per-channel scale and shift that modulate
    the second (non-affine) group norm. The block returns the input plus the
    output of the second convolution.

    Args:
        channels: Channel count of input and output (both group norms use
            8 groups).
        time_dim: Width of the conditioning embedding.
    """

    def __init__(self, channels, time_dim):
        super().__init__()
        conv_opts = dict(kernel_size=3, padding=1, padding_mode='replicate')
        self.first_conv = nn.Conv2d(channels, channels, bias=False, **conv_opts)
        self.second_conv = nn.Conv2d(channels, channels, **conv_opts)
        self.gn1 = nn.GroupNorm(8, channels, affine=True)
        self.gn2 = nn.GroupNorm(8, channels, affine=False)
        self.embd_affine = nn.Linear(time_dim, channels * 2)
        self.act = nn.LeakyReLU(inplace=True)

    def forward(self, x, t_emb):
        """Return ``x`` plus the conditioned two-convolution branch."""
        # First half of the projection is the scale, second half the shift.
        scale, shift = self.embd_affine(t_emb).chunk(2, dim=1)
        gain = 1 + scale[:, :, None, None]
        offset = shift[:, :, None, None]

        hidden = self.gn1(x)
        hidden = self.first_conv(self.act(hidden))

        # Adaptive normalisation: plain group norm followed by the
        # embedding-driven affine transform.
        hidden = self.gn2(hidden) * gain + offset
        branch = self.second_conv(self.act(hidden))
        return x + branch
class DiTLayer(nn.Module):
    """Transformer layer whose norms and residual gates are driven by an embedding.

    A linear projection of the embedding yields six ``d_model``-wide vectors:
    a scale and shift for each of the two layer norms, and a gate for each of
    the two residual branches (self-attention and feed-forward).

    Note that each residual branch is added onto the normalised and modulated
    activations that were fed into that branch, not onto the layer's raw input.

    Args:
        d_model: Token width.
        embd_dim: Width of the conditioning embedding.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward network.
    """

    def __init__(self, d_model, embd_dim, nhead, dim_feedforward=1024):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model, elementwise_affine=False)
        self.attn = nn.MultiheadAttention(d_model, nhead, batch_first=False)
        self.norm2 = nn.LayerNorm(d_model, elementwise_affine=False)
        self.embd_affine = nn.Linear(embd_dim, 6*d_model)
        ffn_layers = [
            nn.Linear(d_model, dim_feedforward),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(dim_feedforward, d_model),
        ]
        self.ffn = nn.Sequential(*ffn_layers)

    def forward(self, x, embd):
        """Run the layer on ``x``; the attention is built with ``batch_first=False``."""
        # Give every modulation vector a leading axis so that it broadcasts
        # over the first axis of x.
        scale1, scale2, shift1, shift2, alpha1, alpha2 = (
            part.unsqueeze(0) for part in self.embd_affine(embd).chunk(6, dim=1)
        )

        # Gated self-attention branch.
        attn_in = self.norm1(x) * (1 + scale1) + shift1
        attn_out, _ = self.attn(attn_in, attn_in, attn_in)
        x = attn_in + attn_out * alpha1

        # Gated feed-forward branch.
        ffn_in = self.norm2(x) * (1 + scale2) + shift2
        return ffn_in + self.ffn(ffn_in) * alpha2
class DiTBlock(nn.Module):
    """Run a stack of ``DiTLayer`` modules over non-overlapping image patches.

    The feature map is unfolded into ``patch_size x patch_size`` patches, each
    flattened into a token of width ``channels * patch_size**2``. A fixed 2-D
    sinusoidal positional embedding is added, the tokens pass through the
    layers, and the result is folded back to the input's spatial size.

    Args:
        channels: Channel count of the incoming feature map.
        embd_dim: Width of the conditioning embedding given to every layer.
        patch_size: Side length (and stride) of the square patches.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked layers.
    """

    def __init__(self, channels, embd_dim, patch_size, nhead, num_layers):
        super().__init__()
        self.patch_size = patch_size
        self.patchify = nn.Unfold(kernel_size=patch_size, stride=patch_size)
        token_dim = channels * patch_size**2
        self.pos_embd = SinusoidalPositionalEmbedding2D(token_dim)
        layers = []
        for _ in range(num_layers):
            # The feed-forward width is twice the token width.
            layers.append(DiTLayer(token_dim, embd_dim, nhead, 2*token_dim))
        self.dit_layers = nn.ModuleList(layers)

    def forward(self, x, embd):
        """Transform ``x`` (unpacked as ``B, C, H, W``) and return a tensor folded back to ``(H, W)``."""
        _, _, H, W = x.shape
        p = self.patch_size

        tokens = self.patchify(x).transpose(1, 2)  # [B, num_patches, token_dim]
        grid = self.pos_embd(H // p, W // p).to(dtype=tokens.dtype)
        tokens = tokens + grid.unsqueeze(0)

        # The layers' attention uses batch_first=False, so put patches first.
        tokens = tokens.transpose(0, 1)  # [num_patches, B, token_dim]
        for layer in self.dit_layers:
            tokens = layer(tokens, embd)

        tokens = tokens.permute(1, 2, 0)  # [B, token_dim, num_patches]
        return nn.functional.fold(tokens, (H, W), (p, p), stride=(p, p))
class UpBlock(nn.Module):
    """Decoder stage: residual block, 2x transposed-conv upsample, skip merge.

    Args:
        in_ch: Channels entering the stage.
        out_ch: Channels produced by the transposed convolution.
        time_dim: Width of the conditioning embedding for the residual block.
        cat: If true, the skip tensor is concatenated on the channel axis;
            otherwise it is added element-wise.
    """

    def __init__(self, in_ch, out_ch, time_dim, cat):
        super().__init__()
        self.res = ResConvBlock(in_ch, time_dim)
        self.up = nn.ConvTranspose2d(in_ch, out_ch, 4, stride=2, padding=1)
        self.cat = cat

    def forward(self, x, t_emb, skip=None):
        """Upsample ``x`` and merge it with ``skip``.

        Although ``skip`` defaults to ``None``, both merge modes use it, so a
        tensor has to be supplied.
        """
        upsampled = self.up(self.res(x, t_emb))
        if not self.cat:
            return upsampled + skip
        return torch.cat([upsampled, skip], dim=1)
class UpBlockWithDit(nn.Module):
    """Decoder stage with a patch-transformer side branch before upsampling.

    After the residual block, the features are projected down to ``mid_ch``
    channels by a 1x1 convolution, group-normalised, passed through a
    ``DiTBlock``, projected back up to ``in_ch`` and added onto the features.
    The sum is then upsampled 2x and merged with the skip tensor.

    Args:
        in_ch: Channels entering the stage.
        mid_ch: Channels inside the transformer branch (normalised in 4 groups).
        out_ch: Channels produced by the transposed convolution.
        patch_size: Patch side length for the ``DiTBlock``.
        nhead: Attention heads for the ``DiTBlock``.
        time_dim: Width of the conditioning embedding.
        layers: Number of transformer layers in the ``DiTBlock``.
        cat: If true, concatenate the skip tensor on the channel axis;
            otherwise add it element-wise.
    """

    def __init__(self, in_ch, mid_ch, out_ch, patch_size, nhead, time_dim, layers, cat):
        super().__init__()
        self.res = ResConvBlock(in_ch, time_dim)
        self.down_map = nn.Conv2d(in_ch, mid_ch, kernel_size=1, bias=False)
        self.down_norm = nn.GroupNorm(4, mid_ch)
        self.dit = DiTBlock(mid_ch, time_dim, patch_size, nhead, layers)
        self.up_map = nn.Conv2d(mid_ch, in_ch, kernel_size=1)
        self.up = nn.ConvTranspose2d(in_ch, out_ch, 4, stride=2, padding=1)
        self.cat = cat

    def forward(self, x, embd, skip=None):
        """Refine ``x``, upsample it and merge it with ``skip``.

        Although ``skip`` defaults to ``None``, both merge modes use it, so a
        tensor has to be supplied.
        """
        x = self.res(x, embd)

        # Transformer branch on a reduced channel count, added back residually.
        branch = self.down_norm(self.down_map(x))
        branch = self.up_map(self.dit(branch, embd))

        upsampled = self.up(x + branch)
        if not self.cat:
            return upsampled + skip
        return torch.cat([upsampled, skip], dim=1)
def run_block(module, *args):
    """Call ``module`` with ``args`` and return its result.

    A plain top-level function, so a module call can be handed to helpers
    that expect a function followed by its positional arguments.
    """
    result = module(*args)
    return result
class ConditionalUNet(nn.Module):
    """U-Net conditioned on a timestep embedding and a water-level embedding.

    The encoder has ``depth`` residual blocks separated by stride-2
    convolutions that double the channel count. A further stride-2
    convolution forms the bottleneck. The decoder has ``depth`` ``UpBlock``
    stages: every stage adds its skip tensor, except the last, which
    concatenates it. In training mode every sub-module call goes through
    activation checkpointing.

    Args:
        base_ch: Channel count after the input convolution.
        embd_dim: Width of each of the two sinusoidal embeddings; the blocks
            receive their concatenation (``2 * embd_dim``).
        depth: Number of encoder blocks (and of decoder stages).
    """

    def __init__(self, base_ch=16, embd_dim=64, depth=5):
        super().__init__()
        self.depth = depth
        self.time_embd = SinusoidalEmbedding(embd_dim)
        # NOTE(review): the second positional argument of SinusoidalEmbedding
        # is `base`, so this sets base=10 and leaves scaling at its default.
        # Confirm that is intended.
        self.waterlevel_embd = SinusoidalEmbedding(embd_dim, 10)
        embd_dim *= 2  # blocks see both embeddings concatenated

        # forward() concatenates x, ridge_map, basin_map and map_average on
        # the channel axis; this convolution expects 4 input channels in total.
        self.expand = nn.Conv2d(4, base_ch, 3, padding=1, padding_mode='replicate')

        # Encoder
        self.enc_blocks = nn.ModuleList()
        self.enc_dit_blocks = nn.ModuleList()
        self.down_convs = nn.ModuleList()
        width = base_ch
        last = depth - 1
        for level in range(depth):
            self.enc_blocks.append(ResConvBlock(width, embd_dim))
            if level == last:
                continue  # the deepest encoder block is followed by the bottleneck
            self.down_convs.append(
                nn.Conv2d(width, width * 2, 4, stride=2, padding=1, padding_mode='replicate')
            )
            width *= 2

        # Bottleneck
        self.bottleneck = nn.Conv2d(width, width * 2, 4, stride=2, padding=1, padding_mode='replicate')
        width *= 2

        # Decoder
        self.up_blocks = nn.ModuleList()
        for level in range(depth):
            concat = level == last  # only the final stage concatenates its skip
            self.up_blocks.append(UpBlock(width, width // 2, embd_dim, concat))
            width //= 2
            if concat:
                width *= 2

        self.out = ResConvBlock(width, embd_dim)
        self.final = nn.Conv2d(width, 1, 1)

    def _maybe_checkpoint(self, module, *inputs):
        """Call ``module(*inputs)``, using activation checkpointing while training."""
        if self.training:
            return checkpoint(run_block, module, *inputs, use_reentrant=False)
        return module(*inputs)

    def forward(self, x, map_average, ridge_map, basin_map, water_level, t):
        """Run the network and return the single-channel output of the final 1x1 conv.

        ``x``, ``ridge_map``, ``basin_map`` and ``map_average`` are
        concatenated (in that order) on the channel axis. ``t`` and
        ``water_level`` are embedded and cast to the dtype of ``x``.
        """
        call = self._maybe_checkpoint

        cond = torch.cat(
            [self.time_embd(t).to(x.dtype), self.waterlevel_embd(water_level).to(x.dtype)],
            dim=1,
        )
        h = call(self.expand, torch.cat([x, ridge_map, basin_map, map_average], dim=1))

        # Encoder: keep every block output for the decoder.
        skips = []
        for i in range(self.depth):
            h = call(self.enc_blocks[i], h, cond)
            skips.append(h)
            if i < self.depth - 1:
                h = call(self.down_convs[i], h)

        h = call(self.bottleneck, h)

        # Decoder: consume the skips from deepest to shallowest.
        for i in range(self.depth):
            h = call(self.up_blocks[i], h, cond, skips[-1 - i])

        h = call(self.out, h, cond)
        return call(self.final, h)
class ConditionalUNetDiT(nn.Module):
    """Two-level conditional U-Net whose decoder stages contain transformer blocks.

    The encoder applies two residual blocks with a stride-2 convolution after
    each. The decoder applies two ``UpBlockWithDit`` stages: the first adds
    its skip tensor and the second concatenates it. A final residual block
    and a 1x1 convolution give the single-channel output.

    Args:
        base_ch: Channel count after the input convolution.
        embd_dim: Width of each of the two sinusoidal embeddings; the blocks
            receive their concatenation (``2 * embd_dim``).
    """

    def __init__(self, base_ch=8, embd_dim=16):
        super().__init__()
        self.time_embd = SinusoidalEmbedding(embd_dim, scaling=1000)
        self.waterlevel_embd = SinusoidalEmbedding(embd_dim, scaling=1)
        embd_dim *= 2  # blocks see both embeddings concatenated

        # forward() concatenates x, ridge_map and basin_map on the channel
        # axis; this convolution expects 3 input channels in total.
        self.expand = nn.Conv2d(3, base_ch, 3, padding=1, padding_mode='replicate')

        # Encoder; each stride-2 convolution halves the spatial size
        # (e.g. 1024 -> 512 -> 256 for a 1024x1024 input).
        self.enc_0 = ResConvBlock(base_ch, embd_dim)
        self.down0 = nn.Conv2d(base_ch, base_ch * 2, 4, stride=2, padding=1, padding_mode='replicate')
        self.enc_1 = ResConvBlock(base_ch * 2, embd_dim)
        self.down1 = nn.Conv2d(base_ch * 2, base_ch * 4, 4, stride=2, padding=1, padding_mode='replicate')

        # Decoder; each stage doubles the spatial size again.
        self.up1 = UpBlockWithDit(base_ch * 4, base_ch, base_ch * 2, 8, 8, embd_dim, 6, False)
        self.up0 = UpBlockWithDit(base_ch * 2, base_ch//2, base_ch, 16, 16, embd_dim, 3, True)

        # up0 concatenates its skip, so the head sees 2 * base_ch channels.
        self.out = ResConvBlock(base_ch * 2, embd_dim)
        self.final = nn.Conv2d(base_ch * 2, 1, 1)

    def initialize(self):
        """Zero the conditioning projections and the second conv of each residual block.

        Affects ``nn.Linear`` modules whose qualified name contains
        ``'embd_affine'`` or ``'water_level_affine'`` and ``nn.Conv2d``
        modules whose qualified name contains ``'second_conv'``. Both weight
        and bias are zeroed.
        """
        for name, m in self.named_modules():
            zero_linear = isinstance(m, nn.Linear) and (
                'embd_affine' in name or 'water_level_affine' in name
            )
            zero_conv = isinstance(m, nn.Conv2d) and 'second_conv' in name
            if zero_linear or zero_conv:
                m.weight.data.zero_()
                m.bias.data.zero_()

    def forward(self, x, ridge_map, basin_map, water_level, t):
        """Run the network and return the single-channel output of the final 1x1 conv.

        ``x``, ``ridge_map`` and ``basin_map`` are concatenated (in that
        order) on the channel axis. ``t`` and ``water_level`` are embedded and
        cast to the dtype of ``x``. The output is presumably the predicted
        noise used for a diffusion loss; verify against the training code.
        """
        cond = torch.cat(
            [self.time_embd(t).to(x.dtype), self.waterlevel_embd(water_level).to(x.dtype)],
            dim=1,
        )

        # Encoder
        full_res = self.expand(torch.cat([x, ridge_map, basin_map], dim=1))
        full_res = self.enc_0(full_res, cond)
        half_res = self.enc_1(self.down0(full_res), cond)
        quarter_res = self.down1(half_res)

        # Decoder with skip connections
        decoded = self.up1(quarter_res, cond, half_res)
        decoded = self.up0(decoded, cond, full_res)

        decoded = self.out(decoded, cond)
        return self.final(decoded)
-if __name__ == "__main__":
-    #a = ConditionalUNet()
-    #t = SinusoidalEmbedding(256)
-    #t_embd = t(torch.randint(0, 100, (1,)))
-    #x = torch.randn(1, 1, 256, 256)
-    #r = torch.randn(1, 1, 256, 256)
-    #c = a(x, r, t_embd)
-    #print(c)
-    #print(c.shape)
-    network = ConditionalUNetDiT()
-    for name, m in network.named_modules():
-        if isinstance(m, nn.Linear) and 'time_affine':
-            m.weight.data.zero_()
-            m.bias.data.zero_()