# Diff-Refine / src/models/dit.py
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hand-rolled replacements for timm's Attention and Mlp, so the module carries
# no timm dependency (original import: from timm.models.vision_transformer import Attention, Mlp)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
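# Usage sketch (illustrative, not part of the original file):
#   mlp = Mlp(in_features=256, hidden_features=1024)
#   y = mlp(torch.randn(2, 64, 256))     # -> [2, 64, 256]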
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
        # Fused scaled-dot-product attention (dispatches to Flash-Attention
        # kernels when available). It applies the 1/sqrt(head_dim) scale
        # internally, so self.scale is kept only for reference. Manual equivalent:
        #   attn = (q @ k.transpose(-2, -1)) * self.scale
        #   attn = self.attn_drop(attn.softmax(dim=-1))
        #   x = attn @ v
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0
        )
x = x.transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
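# Usage sketch (illustrative): q, k, v leave the permute above with shape
# [B, num_heads, N, head_dim], the layout F.scaled_dot_product_attention expects.
#   attn = Attention(dim=256, num_heads=8, qkv_bias=True)
#   y = attn(torch.randn(2, 64, 256))    # -> [2, 64, 256]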
class Patch1D(nn.Module):
"""
[B, L, D] -> [B, L/P, D*P]
"""
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def forward(self, x):
B, L, D = x.shape
        # Zero-pad so L is divisible by patch_size, e.g. [B, 31, 4] with
        # patch_size=2 -> padded to [B, 32, 4] -> [B, 16, 8]; the pad appends
        # zeros after the last real token.
        if L % self.patch_size != 0:
            pad = self.patch_size - (L % self.patch_size)
            x = F.pad(x, (0, 0, 0, pad))
        B, L_new, D = x.shape
        # Fold each group of patch_size tokens into one feature vector
        # (reshape rather than view, in case the input is non-contiguous)
        return x.reshape(B, L_new // self.patch_size, D * self.patch_size)
class Unpatch1D(nn.Module):
"""
[B, L/P, D*P] -> [B, L, D]
"""
def __init__(self, patch_size):
super().__init__()
self.patch_size = patch_size
def forward(self, x):
B, L_new, DP = x.shape
        return x.reshape(B, L_new * self.patch_size, DP // self.patch_size)
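# Round-trip sketch (illustrative): Patch1D pads, so callers crop back to the
# original length afterwards, as PatchedFlowDiT.forward does below.
#   p, u = Patch1D(2), Unpatch1D(2)
#   x = torch.randn(2, 31, 4)
#   y = u(p(x))[:, :31, :]               # p(x): [2, 16, 8]; y: [2, 31, 4]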
### Note: this DiT uses a learned pos_embed rather than a sinusoidal one, and
### there is no label embedding yet, so forward_with_cfg below conditions only on the latent.
## from: https://github.com/willisma/SiT/blob/main/models.py
class TimestepEmbedder(nn.Module):
"""Sinusoidal Time Embeddings"""
def __init__(self, hidden_size, frequency_embedding_size=256):
super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
self.frequency_embedding_size = frequency_embedding_size
@staticmethod
def timestep_embedding(t, dim, max_period=10000):
"""
Create sinusoidal timestep embeddings.
:param t: a 1-D Tensor of N indices, one per batch element.
These may be fractional.
:param dim: the dimension of the output.
:param max_period: controls the minimum frequency of the embeddings.
:return: an (N, D) Tensor of positional embeddings.
"""
# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        ## Accept extra-dim t (e.g. [B, 1]) by flattening to 1-D
        if t.ndim > 1:
            t = t.view(-1)
half = dim // 2
freqs = torch.exp(
-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
).to(device=t.device)
args = t[:, None].float() * freqs[None]
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
if dim % 2:
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
return embedding
def forward(self, t):
t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
t_emb = self.mlp(t_freq)
return t_emb
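# Usage sketch (illustrative):
#   te = TimestepEmbedder(hidden_size=64)
#   emb = te(torch.rand(2))              # fractional t in [0, 1) -> [2, 64]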
def modulate(x, shift, scale):
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
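# modulate broadcasts per-sample conditioning over tokens: x is [B, L, D] and
# shift/scale are [B, D], so unsqueeze(1) lifts them to [B, 1, D]. Illustrative:
#   modulate(torch.randn(2, 16, 64), torch.zeros(2, 64), torch.zeros(2, 64))
# is the identity, which is why the adaLN layers below are zero-initialized.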
## DiTBlock, adaptive layer norm conditioning
class DiTBlock(nn.Module):
"""Transformer Block with Adaptive Layer Norm (adaLN)"""
def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
super().__init__()
self.hidden_size = hidden_size
self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True)
self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
mlp_hidden_dim = int(hidden_size * mlp_ratio)
approx_gelu = lambda: nn.GELU(approximate="tanh")
self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
nn.Linear(hidden_size, 6 * hidden_size, bias=True)
)
def forward(self, x, c):
        # c: [B, hidden_size] -> adaLN_out: [B, 6 * hidden_size]
        adaLN_out = self.adaLN_modulation(c)
        # --- Debug probe (if this trips again, inspect the shapes in the message) ---
        if adaLN_out.shape[1] != 6 * self.hidden_size:
            raise ValueError(
                f"adaLN output dimension mismatch: c {tuple(c.shape)}, "
                f"adaLN_out {tuple(adaLN_out.shape)}, "
                f"expected dim 1 to be {6 * self.hidden_size}"
            )
        # ---------------------------------------------------------------------
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = adaLN_out.chunk(6, dim=1)
x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
return x
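# Usage sketch (illustrative): with adaLN_modulation zero-initialized (see
# PatchedFlowDiT.initialize_weights below), both gates are 0 and a fresh block
# is the identity mapping.
#   blk = DiTBlock(hidden_size=64, num_heads=4)
#   y = blk(torch.randn(2, 16, 64), torch.randn(2, 64))   # -> [2, 16, 64]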
class PatchedFlowDiT(nn.Module):
"""
Main DiT Architecture for Flow Matching
Input: z_t (Noisy Latent) + t (Time) + condition (Original Latent)
Output: velocity vector
"""
def __init__(self, cfg):
super().__init__()
self.cfg = cfg
        ## Patching / unpatching blocks
self.patcher = Patch1D(cfg.patch_size)
self.unpatcher = Unpatch1D(cfg.patch_size)
        # Input dim after patching: the DiT sees Patch(z_t) concatenated with
        # Patch(condition), i.e. (latent_dim * patch_size) * 2 features.
        input_feat_dim = cfg.latent_dim * cfg.patch_size
        # Projection to the DiT hidden size
        self.input_proj = nn.Linear(input_feat_dim * 2, cfg.dit_hidden)
# Time & Pos Embeddings
self.time_embed = TimestepEmbedder(cfg.dit_hidden)
patched_len = (cfg.max_seq_len + cfg.patch_size - 1) // cfg.patch_size
self.pos_embed = nn.Parameter(torch.zeros(1, patched_len, cfg.dit_hidden))
self.blocks = nn.ModuleList([
DiTBlock(cfg.dit_hidden, cfg.dit_heads) for _ in range(cfg.dit_layers)
])
# Output Projection (Predict Velocity)
self.final_layer = nn.Linear(cfg.dit_hidden, input_feat_dim)
self.initialize_weights()
def initialize_weights(self):
# Initialize transformer layers:
def _basic_init(module):
if isinstance(module, nn.Linear):
torch.nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
self.apply(_basic_init)
# Initialize pos_embed
nn.init.normal_(self.pos_embed, std=0.02)
# Zero-out adaLN modulation layers
for block in self.blocks:
nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
        # The final layer predicts data (a velocity field) rather than residual
        # noise, so it gets Xavier init instead of the zero init used in the
        # original DiT.
        nn.init.xavier_uniform_(self.final_layer.weight)
        nn.init.constant_(self.final_layer.bias, 0)
    def forward(self, z_t, t, condition):
        """
        z_t:       [B, L, D] noisy latent
        t:         [B] timesteps
        condition: [B, L, D] conditioning latent (e.g. the source sentence)
        returns:   [B, L, D] predicted velocity
        """
# 1. Patching
z_p = self.patcher(z_t)
c_p = self.patcher(condition)
        # 2. Concat & project (JiT-style explicit conditioning)
x = torch.cat([z_p, c_p], dim=-1)
x = self.input_proj(x)
# 3. Add Embeddings
t_emb = self.time_embed(t)
# Handle length mismatch due to padding
L_curr = x.shape[1]
x = x + self.pos_embed[:, :L_curr, :]
# 4. Transformer
for block in self.blocks:
x = block(x, t_emb)
# 5. Output & Unpatch
v_p = self.final_layer(x)
v = self.unpatcher(v_p)
# Crop to original length
return v[:, :z_t.shape[1], :]
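    # Shape walkthrough (illustrative, patch_size=2, L=31): z_t [B, 31, D]
    # -> patcher pads to [B, 32, D] and folds to [B, 16, 2D]; concatenating
    # c_p gives [B, 16, 4D] -> input_proj [B, 16, H] -> final_layer
    # [B, 16, 2D] -> unpatcher [B, 32, D] -> crop back to [B, 31, D].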
    def forward_with_cfg(self, x, t, condition, cfg_scale):
        """
        Forward pass with classifier-free guidance.
        """
        # 1. Conditional branch
        cond_out = self.forward(x, t, condition)
        # 2. Unconditional branch: forward() cannot take condition=None (it is
        #    patched and concatenated), so an all-zeros condition serves as the
        #    CFG "null" conditioning.
        uncond_out = self.forward(x, t, torch.zeros_like(condition))
        # 3. Classifier-free guidance on the predicted velocity:
        #    v = v_uncond + s * (v_cond - v_uncond)
        return uncond_out + cfg_scale * (cond_out - uncond_out)
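

if __name__ == "__main__":
    # Minimal smoke test (illustrative; the real cfg object lives elsewhere in
    # this repo, so a SimpleNamespace stand-in with the field names used above
    # is assumed here).
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        latent_dim=4, patch_size=2, max_seq_len=31,
        dit_hidden=64, dit_heads=4, dit_layers=2,
    )
    model = PatchedFlowDiT(cfg)
    z_t = torch.randn(2, 31, 4)
    cond = torch.randn(2, 31, 4)
    t = torch.rand(2)
    v = model(z_t, t, cond)
    assert v.shape == z_t.shape
    v_cfg = model.forward_with_cfg(z_t, t, cond, cfg_scale=2.0)
    assert v_cfg.shape == z_t.shape
    print("smoke test ok:", tuple(v.shape))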