# Diff-Refine / src/models/dit.py
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hand-rolled replacements for timm's Attention and Mlp, so the module carries
# no timm dependency (original import: from timm.models.vision_transformer import Attention, Mlp)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
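# Usage sketch (illustrative, not part of the original file):
#   mlp = Mlp(in_features=256, hidden_features=1024)
#   y = mlp(torch.randn(2, 64, 256))     # -> [2, 64, 256]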
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
        # Fused scaled-dot-product attention (dispatches to Flash-Attention
        # kernels when available). It applies the 1/sqrt(head_dim) scale
        # internally, so self.scale is kept only for reference. Manual equivalent:
        #   attn = (q @ k.transpose(-2, -1)) * self.scale
        #   attn = self.attn_drop(attn.softmax(dim=-1))
        #   x = attn @ v
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0
        )
x = x.transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
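# Usage sketch (illustrative): q, k, v leave the permute above with shape
# [B, num_heads, N, head_dim], the layout F.scaled_dot_product_attention expects.
#   attn = Attention(dim=256, num_heads=8, qkv_bias=True)
#   y = attn(torch.randn(2, 64, 256))    # -> [2, 64, 256]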
class Patch1D(nn.Module):
"""
[B, L, D] -> [B, L/P, D*P]
"""
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def forward(self, x):
B, L, D = x.shape
        # Zero-pad so L is divisible by patch_size, e.g. [B, 31, 4] with
        # patch_size=2 -> padded to [B, 32, 4] -> [B, 16, 8]; the pad appends
        # zeros after the last real token.
        if L % self.patch_size != 0:
            pad = self.patch_size - (L % self.patch_size)
            x = F.pad(x, (0, 0, 0, pad))
        B, L_new, D = x.shape
        # Fold each group of patch_size tokens into one feature vector
        # (reshape rather than view, in case the input is non-contiguous)
        return x.reshape(B, L_new // self.patch_size, D * self.patch_size)
class Unpatch1D(nn.Module):
"""
[B, L/P, D*P] -> [B, L, D]
"""
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def forward(self, x):
B, L_new, DP = x.shape
        return x.reshape(B, L_new * self.patch_size, DP // self.patch_size)
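# Round-trip sketch (illustrative): Patch1D pads, so callers crop back to the
# original length afterwards, as PatchedFlowDiT.forward does below.
#   p, u = Patch1D(2), Unpatch1D(2)
#   x = torch.randn(2, 31, 4)
#   y = u(p(x))[:, :31, :]               # p(x): [2, 16, 8]; y: [2, 31, 4]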
### Note: this DiT uses a learned pos_embed rather than a sinusoidal one, and
### there is no label embedding yet, so forward_with_cfg below conditions only on the latent.
## from: https://github.com/willisma/SiT/blob/main/models.py
class TimestepEmbedder(nn.Module):
"""Sinusoidal Time Embeddings"""
def __init__(self, hidden_size, frequency_embedding_size=256):
super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
self.frequency_embedding_size = frequency_embedding_size
@staticmethod
def timestep_embedding(t, dim, max_period=10000):
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        ## Accept extra-dim t (e.g. [B, 1]) by flattening to 1-D
        if t.ndim > 1:
            t = t.view(-1)
half = dim // 2
freqs = torch.exp(
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
).to(device=t.device)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
def forward(self, t):
t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
t_emb = self.mlp(t_freq)
return t_emb
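# Usage sketch (illustrative):
#   te = TimestepEmbedder(hidden_size=64)
#   emb = te(torch.rand(2))              # fractional t in [0, 1) -> [2, 64]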
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
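# modulate broadcasts per-sample conditioning over tokens: x is [B, L, D] and
# shift/scale are [B, D], so unsqueeze(1) lifts them to [B, 1, D]. Illustrative:
#   modulate(torch.randn(2, 16, 64), torch.zeros(2, 64), torch.zeros(2, 64))
# is the identity, which is why the adaLN layers below are zero-initialized.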
## DiTBlock, adaptive layer norm conditioning
class DiTBlock(nn.Module):
"""Transformer Block with Adaptive Layer Norm (adaLN)"""
def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
super().__init__()
self.hidden_size = hidden_size
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True)
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
mlp_hidden_dim = int(hidden_size * mlp_ratio)
approx_gelu = lambda: nn.GELU(approximate="tanh")
self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
nn.Linear(hidden_size, 6 * hidden_size, bias=True)
)
def forward(self, x, c):
        # c: [B, hidden_size] -> adaLN_out: [B, 6 * hidden_size]
        adaLN_out = self.adaLN_modulation(c)
        # --- Debug probe (if this trips again, inspect the shapes in the message) ---
        if adaLN_out.shape[1] != 6 * self.hidden_size:
            raise ValueError(
                f"adaLN output dimension mismatch: c {tuple(c.shape)}, "
                f"adaLN_out {tuple(adaLN_out.shape)}, "
                f"expected dim 1 to be {6 * self.hidden_size}"
            )
        # ---------------------------------------------------------------------
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = adaLN_out.chunk(6, dim=1)
x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
return x
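# Usage sketch (illustrative): with adaLN_modulation zero-initialized (see
# PatchedFlowDiT.initialize_weights below), both gates are 0 and a fresh block
# is the identity mapping.
#   blk = DiTBlock(hidden_size=64, num_heads=4)
#   y = blk(torch.randn(2, 16, 64), torch.randn(2, 64))   # -> [2, 16, 64]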
class PatchedFlowDiT(nn.Module):
"""
Main DiT Architecture for Flow Matching
Input: z_t (Noisy Latent) + t (Time) + condition (Original Latent)
Output: velocity vector
"""
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
        ## Patching / unpatching blocks
self.patcher = Patch1D(cfg.patch_size)
self.unpatcher = Unpatch1D(cfg.patch_size)
        # Input dim after patching: the DiT sees Patch(z_t) concatenated with
        # Patch(condition), i.e. (latent_dim * patch_size) * 2 features.
        input_feat_dim = cfg.latent_dim * cfg.patch_size
        # Projection to the DiT hidden size
        self.input_proj = nn.Linear(input_feat_dim * 2, cfg.dit_hidden)
# Time & Pos Embeddings
self.time_embed = TimestepEmbedder(cfg.dit_hidden)
patched_len = (cfg.max_seq_len + cfg.patch_size - 1) // cfg.patch_size
self.pos_embed = nn.Parameter(torch.zeros(1, patched_len, cfg.dit_hidden))
self.blocks = nn.ModuleList([
DiTBlock(cfg.dit_hidden, cfg.dit_heads) for _ in range(cfg.dit_layers)
])
# Output Projection (Predict Velocity)
self.final_layer = nn.Linear(cfg.dit_hidden, input_feat_dim)
self.initialize_weights()
def initialize_weights(self):
# Initialize transformer layers:
def _basic_init(module):
if isinstance(module, nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize pos_embed
nn.init.normal_(self.pos_embed, std=0.02)
# Zero-out adaLN modulation layers
for block in self.blocks:
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
        # The final layer predicts data (a velocity field) rather than residual
        # noise, so it gets Xavier init instead of the zero init used in the
        # original DiT.
        nn.init.xavier_uniform_(self.final_layer.weight)
        nn.init.constant_(self.final_layer.bias, 0)
    def forward(self, z_t, t, condition):
        """
        z_t:       [B, L, D] noisy latent
        t:         [B] timesteps
        condition: [B, L, D] conditioning latent (e.g. the source sentence)
        returns:   [B, L, D] predicted velocity
        """
# 1. Patching
z_p = self.patcher(z_t)
c_p = self.patcher(condition)
        # 2. Concat & project (JiT-style explicit conditioning)
x = torch.cat([z_p, c_p], dim=-1)
x = self.input_proj(x)
# 3. Add Embeddings
t_emb = self.time_embed(t)
# Handle length mismatch due to padding
L_curr = x.shape[1]
x = x + self.pos_embed[:, :L_curr, :]
# 4. Transformer
for block in self.blocks:
x = block(x, t_emb)
# 5. Output & Unpatch
v_p = self.final_layer(x)
v = self.unpatcher(v_p)
# Crop to original length
return v[:, :z_t.shape[1], :]
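    # Shape walkthrough (illustrative, patch_size=2, L=31): z_t [B, 31, D]
    # -> patcher pads to [B, 32, D] and folds to [B, 16, 2D]; concatenating
    # c_p gives [B, 16, 4D] -> input_proj [B, 16, H] -> final_layer
    # [B, 16, 2D] -> unpatcher [B, 32, D] -> crop back to [B, 31, D].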
    def forward_with_cfg(self, x, t, condition, cfg_scale):
        """
        Forward pass with classifier-free guidance.
        """
        # 1. Conditional branch
        cond_out = self.forward(x, t, condition)
        # 2. Unconditional branch: forward() cannot take condition=None (it is
        #    patched and concatenated), so an all-zeros condition serves as the
        #    CFG "null" conditioning.
        uncond_out = self.forward(x, t, torch.zeros_like(condition))
        # 3. Classifier-free guidance on the predicted velocity:
        #    v = v_uncond + s * (v_cond - v_uncond)
        return uncond_out + cfg_scale * (cond_out - uncond_out)
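

if __name__ == "__main__":
    # Minimal smoke test (illustrative; the real cfg object lives elsewhere in
    # this repo, so a SimpleNamespace stand-in with the field names used above
    # is assumed here).
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        latent_dim=4, patch_size=2, max_seq_len=31,
        dit_hidden=64, dit_heads=4, dit_layers=2,
    )
    model = PatchedFlowDiT(cfg)
    z_t = torch.randn(2, 31, 4)
    cond = torch.randn(2, 31, 4)
    t = torch.rand(2)
    v = model(z_t, t, cond)
    assert v.shape == z_t.shape
    v_cfg = model.forward_with_cfg(z_t, t, cond, cfg_scale=2.0)
    assert v_cfg.shape == z_t.shape
    print("smoke test ok:", tuple(v.shape))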