Upload folder using huggingface_hub

f17ae24 verified 9 days ago

16.5 kB

	# DiT for video, from: https://github.com/world-model-eval/world-model-eval/blob/master/src/world_model_eval/model.py

	import torch
	from torch import nn
	import torch.nn.functional as F
	import einops
	import math
	import functools
	from typing import Sequence, Optional, Union, Dict, Tuple
	import sys
	from enum import Enum

	class StrEnum(str, Enum):
	    """Enum whose members are strings and whose ``str()`` is the raw value."""

	    def __str__(self):
	        # Render the member as its underlying value rather than "Class.NAME".
	        member_value = self.value
	        return str(member_value)


	class AttentionType(StrEnum):
	    """Axis family over which an attention layer mixes tokens."""

	    # Attend across the (h, w) grid of each frame.
	    SPATIAL = "spatial"
	    # Attend across frames at each grid location.
	    TEMPORAL = "temporal"


	class RotaryType(StrEnum):
	    """Selects how rotary frequencies and coordinates are generated."""

	    # Integer positions with geometrically spaced inverse frequencies.
	    STANDARD = "standard"
	    # Coordinates in [-1, 1] with linearly spaced frequencies scaled by pi.
	    PIXEL = "pixel"


	@functools.lru_cache
	def rope_nd(
	    shape: Sequence[int],
	    dim: int = 64,
	    base: float = 10_000.0,
	    rotary_type: RotaryType = RotaryType.STANDARD,
	    *,
	    dtype: torch.dtype = torch.float32,
	    device: Optional[torch.device] = None,
	) -> torch.Tensor:
	    """Build an N-dimensional rotary embedding table.

	    The feature dimension is split evenly across the ``len(shape)`` axes. For
	    each axis the table stores ``half`` cosines followed by ``half`` sines,
	    and the per-axis pieces are concatenated along the last dimension.

	    Results are memoized with ``functools.lru_cache``, so every argument must
	    be hashable and the returned tensor is shared between calls with the same
	    arguments.

	    Args:
	        shape: Size of each positional axis.
	        dim: Total feature dimension; must be divisible by ``2 * len(shape)``.
	        base: Frequency base used by the ``STANDARD`` variant.
	        rotary_type: Which frequency/coordinate scheme to use.
	        dtype: Dtype of the generated table.
	        device: Device of the generated table.

	    Returns:
	        Tensor of shape ``(*shape, dim)``.

	    Raises:
	        AssertionError: If ``dim`` is not divisible by ``2 * len(shape)``.
	        NotImplementedError: If ``rotary_type`` is not a known variant.
	    """
	    num_axes = len(shape)
	    assert dim % (2 * num_axes) == 0, (
	        f"`dim` must be divisible by 2 × D (got dim={dim}, D={num_axes})"
	    )

	    # Each axis owns dim // num_axes features, half cosines and half sines.
	    half = (dim // num_axes) // 2
	    factory = {"device": device, "dtype": dtype}

	    if rotary_type == RotaryType.STANDARD:
	        exponents = torch.arange(half, **factory) / half
	        inv_freq = 1.0 / (base ** exponents)
	        coords = [torch.arange(n, **factory) for n in shape]
	    elif rotary_type == RotaryType.PIXEL:
	        inv_freq = torch.linspace(1.0, 256.0 / 2, half, **factory) * math.pi
	        coords = [torch.linspace(-1, +1, steps=n, **factory) for n in shape]
	    else:
	        raise NotImplementedError(f"invalid rotary type: {rotary_type}")

	    per_axis = []
	    for grid in torch.meshgrid(*coords, indexing="ij"):
	        angles = grid.unsqueeze(-1) * inv_freq
	        per_axis.append(torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1))
	    return torch.cat(per_axis, dim=-1)


	def rotate_half(x: torch.Tensor) -> torch.Tensor:
	    """Rotate adjacent feature pairs by 90 degrees: ``(a, b) -> (-b, a)``.

	    The last dimension is interpreted as consecutive pairs; the output has
	    the same shape as ``x``.
	    """
	    pairs = x.view(*x.shape[:-1], -1, 2)
	    first = pairs[..., 0]
	    second = pairs[..., 1]
	    rotated = torch.stack((-second, first), dim=-1)
	    # Merge the (pair, 2) axes back into a single feature axis.
	    return rotated.flatten(-2)


	def rope_mix(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
	    """Apply a rotary rotation to ``x`` given per-pair cosines and sines.

	    ``cos`` and ``sin`` hold one value per feature pair; each value is
	    duplicated so it lines up with both members of its pair in ``x``.
	    """
	    cos_full = cos.repeat_interleave(2, dim=-1)
	    sin_full = sin.repeat_interleave(2, dim=-1)
	    rotated = rotate_half(x)
	    return x * cos_full + rotated * sin_full


	def apply_rope_nd(
	    q: torch.Tensor,
	    k: torch.Tensor,
	    shape: Sequence[int],
	    rotary_type: RotaryType,
	    *,
	    base: float = 10_000.0,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	    """Rotate queries and keys with an N-dimensional rotary embedding.

	    Args:
	        q: Query tensor whose trailing dimensions are ``(*shape, head_dim)``.
	        k: Key tensor laid out like ``q``.
	        shape: Size of each positional axis. Any sequence of ints is accepted.
	        rotary_type: Which rotary scheme ``rope_nd`` should generate.
	        base: Frequency base forwarded to ``rope_nd``.

	    Returns:
	        The rotated ``(q, k)`` pair, each with the same shape as its input.
	    """
	    # `rope_nd` is wrapped in lru_cache, so its arguments must be hashable.
	    # Normalising to a tuple lets callers pass lists as well.
	    shape = tuple(shape)
	    num_axes = len(shape)
	    head_dim = q.shape[-1]

	    table = rope_nd(
	        shape, head_dim, base, rotary_type=rotary_type, dtype=q.dtype, device=q.device
	    )
	    # The table stores [cos | sin] per axis; split them apart, then flatten
	    # the per-axis pieces into one value per feature pair.
	    table = table.view(*shape, num_axes, 2, -1)
	    cos, sin = table.unbind(-2)
	    cos = cos.reshape(*shape, -1)
	    sin = sin.reshape(*shape, -1)

	    q_rot = rope_mix(q, cos, sin)
	    k_rot = rope_mix(k, cos, sin)
	    return q_rot, k_rot


	class FinalLayer(nn.Module):
	    """Output head: adaLN-modulated layer norm followed by a linear projection.

	    The projection emits ``patch_size * patch_size * out_channels`` values
	    per token.
	    """

	    def __init__(self, dim: int, patch_size: int, out_channels: int) -> None:
	        super().__init__()
	        patch_features = patch_size * patch_size * out_channels
	        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
	        self.linear = nn.Linear(dim, patch_features, bias=True)
	        self.adaLN_modulation = nn.Sequential(
	            nn.SiLU(),
	            nn.Linear(dim, dim * 2, bias=True),
	        )

	    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
	        """Modulate ``x`` (b, t, h, w, d) with conditioning ``c`` (b, t, d) and project."""
	        _, _, height, width, _ = x.shape
	        # Broadcast the per-frame modulation over the spatial grid.
	        modulation = einops.repeat(
	            self.adaLN_modulation(c), "b t d -> b t h w d", h=height, w=width
	        )
	        shift, scale = modulation.chunk(2, dim=-1)
	        return self.linear(self.norm(x) * (1 + scale) + shift)


	class Attention(nn.Module):
	    """Multi-head self-attention over either the spatial grid or the time axis.

	    Queries and keys receive rotary position embeddings over the attended
	    axes before scaled dot-product attention is applied.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_heads: int,
	        is_causal: bool,
	        attention_type: AttentionType,
	        rotary_type: RotaryType = RotaryType.STANDARD,
	    ) -> None:
	        super().__init__()
	        assert dim % num_heads == 0
	        self.dim = dim
	        self.num_heads = num_heads
	        self.is_causal = is_causal
	        self.attention_type = attention_type
	        self.rotary_type = rotary_type
	        self.qkv_proj = nn.Linear(dim, dim * 3, bias=False)
	        self.out_proj = nn.Linear(dim, dim)

	    def forward(self, x: torch.Tensor):
	        """Attend over ``x`` of shape (b, t, h, w, d); the output has the same shape."""
	        _, T, H, W, _ = x.shape

	        # Fold the axes that are NOT attended over into the batch dimension.
	        if self.attention_type == AttentionType.SPATIAL:
	            tokens = einops.rearrange(x, "b t h w d -> (b t) h w d")
	        elif self.attention_type == AttentionType.TEMPORAL:
	            tokens = einops.rearrange(x, "b t h w d -> (b h w) t d")
	        else:
	            raise NotImplementedError(f"invalid attention type: {self.attention_type}")
	        sequence_shape = tokens.shape[1:-1]

	        q, k, v = (
	            einops.rearrange(part, "B ... (head d) -> B head ... d", head=self.num_heads)
	            for part in self.qkv_proj(tokens).chunk(3, dim=-1)
	        )

	        # Rotary embeddings are applied while the positional axes are still separate.
	        q, k = apply_rope_nd(q, k, sequence_shape, rotary_type=self.rotary_type)

	        # Flatten the positional axes into a single sequence axis.
	        q, k, v = (
	            einops.rearrange(part, "B head ... d -> B head (...) d")
	            for part in (q, k, v)
	        )

	        attended = F.scaled_dot_product_attention(q, k, v, is_causal=self.is_causal)
	        merged = einops.rearrange(attended, "B head seq d -> B seq (head d)")
	        out = self.out_proj(merged)

	        # Undo the batch folding performed above.
	        if self.attention_type == AttentionType.SPATIAL:
	            return einops.rearrange(out, "(b t) (h w) d -> b t h w d", t=T, h=H, w=W)
	        if self.attention_type == AttentionType.TEMPORAL:
	            return einops.rearrange(out, "(b h w) t d -> b t h w d", h=H, w=W)
	        return out


	class DiTBlock(nn.Module):
	    """Transformer block with adaLN conditioning around attention and an MLP.

	    The conditioning produces six chunks: shift, scale and gate for the
	    attention branch, followed by shift, scale and gate for the feed-forward
	    branch.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_heads: int,
	        attention_type: AttentionType,
	        rotary_type: RotaryType,
	        is_causal: bool,
	    ) -> None:
	        super().__init__()
	        self.adaLN_modulation = nn.Sequential(
	            nn.SiLU(),
	            nn.Linear(dim, dim * 6, bias=True),
	        )
	        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
	        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
	        self.attn = Attention(
	            dim,
	            num_heads,
	            is_causal=is_causal,
	            attention_type=attention_type,
	            rotary_type=rotary_type,
	        )
	        hidden_dim = dim * 4
	        self.ffwd = nn.Sequential(
	            nn.Linear(dim, hidden_dim),
	            nn.GELU(approximate="tanh"),
	            nn.Linear(hidden_dim, dim),
	        )

	    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
	        """Apply gated attention and feed-forward residual updates to ``x``."""
	        _, _, height, width, _ = x.shape
	        # Broadcast the per-frame modulation over the spatial grid.
	        modulation = einops.repeat(
	            self.adaLN_modulation(c), "b t d -> b t h w d", h=height, w=width
	        )
	        (
	            shift_attn,
	            scale_attn,
	            gate_attn,
	            shift_ffwd,
	            scale_ffwd,
	            gate_ffwd,
	        ) = modulation.chunk(6, dim=-1)

	        attn_in = self.norm1(x) * (1 + scale_attn) + shift_attn
	        x = x + self.attn(attn_in) * gate_attn

	        ffwd_in = self.norm2(x) * (1 + scale_ffwd) + shift_ffwd
	        return x + self.ffwd(ffwd_in) * gate_ffwd


	class Block(nn.Module):
	    """A spatial DiT block followed by a temporal DiT block.

	    Args:
	        dim: Token feature dimension.
	        num_heads: Number of attention heads in both sub-blocks.
	        rope_config: Optional mapping from attention type to rotary type.
	            Attention types missing from the mapping (or a ``None``/empty
	            mapping) fall back to ``RotaryType.STANDARD``.
	        temporal_causal: Whether the temporal attention is causal. The
	            spatial attention is never causal.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_heads: int,
	        rope_config: Optional[Dict[AttentionType, RotaryType]] = None,
	        temporal_causal: bool = True,
	    ) -> None:
	        super().__init__()
	        # Resolve each rotary type independently so that a config naming only
	        # one attention type does not raise KeyError for the other.
	        rope_config = rope_config or {}
	        spatial_rotary = rope_config.get(AttentionType.SPATIAL, RotaryType.STANDARD)
	        temporal_rotary = rope_config.get(AttentionType.TEMPORAL, RotaryType.STANDARD)

	        self.s_block = DiTBlock(
	            dim,
	            num_heads,
	            is_causal=False,
	            attention_type=AttentionType.SPATIAL,
	            rotary_type=spatial_rotary,
	        )
	        self.t_block = DiTBlock(
	            dim,
	            num_heads,
	            is_causal=temporal_causal,
	            attention_type=AttentionType.TEMPORAL,
	            rotary_type=temporal_rotary,
	        )

	    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
	        """Run the spatial block, then the temporal block, with conditioning ``c``."""
	        x = self.s_block(x, c)
	        x = self.t_block(x, c)
	        return x


	class ActionEmbedder(nn.Module):
	    """Embed an action sequence and compress it along the time axis.

	    Actions pass through an input MLP, an optional stack of stride-2 1-D
	    convolutions, and an output MLP. With ``compress_rate = 2**k`` the stack
	    has ``k`` convolutions, so an input of length
	    ``L = compress_rate * (T - 1) + 1`` is reduced to length ``T``.

	    Args:
	        action_dim: Number of features per action step.
	        dim: Embedding dimension.
	        compress_rate: Temporal compression factor. Values ``<= 1`` disable
	            compression; larger values must be a power of two.

	    Raises:
	        ValueError: If ``compress_rate`` is greater than 1 and not a power
	            of two. Such rates previously fell back silently to no
	            compression at all.
	    """

	    def __init__(self, action_dim: int, dim: int, compress_rate: int = 4):
	        super().__init__()
	        self.compress_rate = compress_rate
	        self.mlp_in = nn.Sequential(
	            nn.Linear(action_dim, dim),
	            nn.SiLU(),
	            nn.Linear(dim, dim),
	        )
	        self.downsample = self._build_downsample(dim, compress_rate)
	        self.mlp_out = nn.Sequential(
	            nn.SiLU(),
	            nn.Linear(dim, dim),
	        )

	    @staticmethod
	    def _build_downsample(dim: int, compress_rate: int) -> nn.Module:
	        """Return the temporal downsampling module for ``compress_rate``."""
	        if compress_rate <= 1:
	            return nn.Identity()

	        rate = int(compress_rate)
	        if rate != compress_rate or rate & (rate - 1):
	            raise ValueError(
	                "compress_rate must be <= 1 (no compression) or a power of two, "
	                f"got {compress_rate}"
	            )

	        # One stride-2 convolution per factor of two, with SiLU in between.
	        # For rates 2 and 4 this reproduces the original layer layout, so
	        # existing state-dict keys (downsample.0, downsample.2) still match.
	        layers = []
	        for stage in range(rate.bit_length() - 1):
	            if stage:
	                layers.append(nn.SiLU())
	            layers.append(nn.Conv1d(dim, dim, kernel_size=3, stride=2, padding=1))
	        return nn.Sequential(*layers)

	    def forward(self, action: torch.Tensor) -> torch.Tensor:
	        """Embed ``action`` of shape [B, L, action_dim] into [B, T, dim]."""
	        # action: [B, L, action_dim] where L = compress_rate * (T-1) + 1
	        x = self.mlp_in(action)  # [B, L, dim]

	        if self.compress_rate > 1:
	            # Conv1d expects channels before the sequence axis.
	            x = x.permute(0, 2, 1)  # [B, dim, L]
	            x = self.downsample(x)  # [B, dim, T]
	            x = x.permute(0, 2, 1)  # [B, T, dim]

	        x = self.mlp_out(x)  # [B, T, dim]
	        return x


	class DiT(nn.Module):
	    """Action-conditioned diffusion transformer over video frames.

	    Frames are patchified with a strided convolution, processed by a stack of
	    spatial/temporal ``Block`` modules conditioned on per-frame timestep and
	    action embeddings, and projected back to the input layout.

	    Args:
	        in_channels: Channels of each input frame (also the output channels).
	        patch_size: Side length of the square patches.
	        dim: Token feature dimension.
	        num_layers: Number of ``Block`` modules.
	        num_heads: Attention heads per block.
	        action_dim: Features per action step.
	        action_compress_rate: Temporal compression passed to ``ActionEmbedder``.
	        max_frames: Stored on the module; not otherwise used by this class.
	        rope_config: Optional rotary configuration forwarded to every block.
	        action_dropout_prob: Probability, per sample and only in training
	            mode, of replacing the action with the null action.
	        temporal_causal: Whether temporal attention is causal.
	    """

	    def __init__(
	        self,
	        in_channels: int = 4,
	        patch_size: int = 2,
	        dim: int = 1152,
	        num_layers: int = 28,
	        num_heads: int = 16,
	        action_dim: int = 0,
	        action_compress_rate: int = 4,
	        max_frames: int = 16,
	        rope_config: Optional[Dict[AttentionType, RotaryType]] = None,
	        action_dropout_prob: float = 0.1,
	        temporal_causal: bool = True,
	    ) -> None:
	        super().__init__()
	        self.in_channels = in_channels
	        self.patch_size = patch_size
	        self.action_dim = action_dim
	        self.action_compress_rate = action_compress_rate
	        self.action_dropout_prob = action_dropout_prob
	        self.x_proj = nn.Conv2d(
	            in_channels, dim, kernel_size=patch_size, stride=patch_size
	        )
	        self.timestep_mlp = nn.Sequential(
	            nn.Linear(256, dim, bias=True),
	            nn.SiLU(),
	            nn.Linear(dim, dim, bias=True),
	        )
	        self.action_embedder = ActionEmbedder(
	            action_dim, dim, compress_rate=action_compress_rate
	        )
	        self.blocks = nn.ModuleList(
	            [
	                Block(dim, num_heads, rope_config, temporal_causal=temporal_causal)
	                for _ in range(num_layers)
	            ]
	        )
	        self.final_layer = FinalLayer(dim, patch_size, in_channels)
	        self.max_frames = max_frames
	        self.initialize_weights()

	    def timestep_embedding(
	        self, t: torch.Tensor, dim: int = 256, max_period: int = 10000
	    ) -> torch.Tensor:
	        """Return sinusoidal embeddings of shape ``(len(t), dim)`` for 1-D ``t``."""
	        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
	        half = dim // 2
	        freqs = torch.exp(
	            -math.log(max_period)
	            * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
	            / half
	        )
	        args = t[:, None].float() * freqs[None]
	        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
	        if dim % 2:
	            # Pad with one zero column so odd `dim` values are honoured.
	            embedding = torch.cat(
	                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
	            )
	        return embedding

	    def initialize_weights(self) -> None:
	        """Initialise all parameters; modulation and output layers start at zero."""

	        # Initialize transformer layers:
	        def _basic_init(module):
	            if isinstance(module, nn.Linear):
	                torch.nn.init.xavier_uniform_(module.weight)
	                if module.bias is not None:
	                    nn.init.constant_(module.bias, 0)

	        self.apply(_basic_init)

	        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
	        w = self.x_proj.weight.data
	        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
	        nn.init.constant_(self.x_proj.bias, 0)

	        # Initialize timestep embedding MLP:
	        nn.init.normal_(self.timestep_mlp[0].weight, std=0.02)
	        nn.init.normal_(self.timestep_mlp[2].weight, std=0.02)

	        # Initialize action embedder:
	        for module in self.action_embedder.modules():
	            if isinstance(module, nn.Linear):
	                nn.init.normal_(module.weight, std=0.02)
	            elif isinstance(module, nn.Conv1d):
	                nn.init.kaiming_normal_(
	                    module.weight, mode='fan_out', nonlinearity='relu'
	                )

	        # Zero-out adaLN modulation layers in DiT blocks:
	        for block in self.blocks:
	            nn.init.constant_(block.s_block.adaLN_modulation[-1].weight, 0)
	            nn.init.constant_(block.s_block.adaLN_modulation[-1].bias, 0)
	            nn.init.constant_(block.t_block.adaLN_modulation[-1].weight, 0)
	            nn.init.constant_(block.t_block.adaLN_modulation[-1].bias, 0)

	        # Zero-out output layers:
	        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
	        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
	        nn.init.constant_(self.final_layer.linear.weight, 0)
	        nn.init.constant_(self.final_layer.linear.bias, 0)

	    def patchify(self, x: torch.Tensor) -> torch.Tensor:
	        """Project frames (b, t, h, w, c) to patch tokens (b, t, h', w', d)."""
	        # Unpacking five names also rejects inputs that are not 5-D.
	        _, T, _, _, _ = x.shape
	        x = einops.rearrange(x, "b t h w c -> (b t) c h w")
	        x = self.x_proj(x)
	        x = einops.rearrange(x, "(b t) d h w -> b t h w d", t=T)
	        return x

	    def unpatchify(self, x: torch.Tensor) -> torch.Tensor:
	        """Expand per-token patch features (b, h, w, p*p*c) back to pixels."""
	        return einops.rearrange(
	            x,
	            "b h w (p1 p2 c) -> b (h p1) (w p2) c",
	            p1=self.patch_size,
	            p2=self.patch_size,
	            c=self.in_channels,
	        )

	    def get_null_cond(self, action: torch.Tensor) -> torch.Tensor:
	        """Return the "no action" conditioning tensor shaped like ``action``."""
	        null_action = torch.zeros_like(action)
	        # NOTE: all-zero action is still conditional (meaning "do not move"), so we
	        # need to reserve the last component of the action vector to indicate null.
	        # With a zero-width action (the default action_dim=0) there is no
	        # component to flag, and indexing [-1] would raise IndexError.
	        if null_action.shape[-1] > 0:
	            null_action[..., -1] = 1
	        return null_action

	    def get_cond(self, t: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
	        """Combine timestep ``t`` (b, t) and ``action`` into conditioning (b, t, d)."""
	        B, T = t.shape
	        t = einops.rearrange(t, "b t -> (b t)")
	        t_freq = self.timestep_embedding(t)
	        c = self.timestep_mlp(t_freq)
	        c = einops.rearrange(c, "(b t) d -> b t d", t=T)
	        if self.training and self.action_dropout_prob > 0:
	            # One drop decision per sample, broadcast over steps and features.
	            should_drop = (
	                torch.rand((B, 1, 1), device=action.device) < self.action_dropout_prob
	            )
	            null_action = self.get_null_cond(action)
	            action = torch.where(should_drop, null_action, action)
	        c += self.action_embedder(action)
	        return c

	    def forward(
	        self, x: torch.Tensor, t: torch.Tensor, action: torch.Tensor
	    ) -> torch.Tensor:
	        """Run the model.

	        Args:
	            x: Frames of shape (b, t, h, w, c).
	            t: Per-frame timesteps of shape (b, t).
	            action: Action sequence consumed by the action embedder.

	        Returns:
	            Tensor laid out as (b, t, h, w, c), like ``x``.
	        """
	        # Unpacking five names also rejects inputs that are not 5-D.
	        _, T, _, _, _ = x.shape
	        x = self.patchify(x)
	        c = self.get_cond(t, action)
	        for block in self.blocks:
	            x = block(x, c)
	        x = self.final_layer(x, c)
	        x = einops.rearrange(x, "b t h w d -> (b t) h w d")
	        x = self.unpatchify(x)
	        x = einops.rearrange(x, "(b t) h w c -> b t h w c", t=T)
	        return x

	if __name__ == "__main__":
	    # Smoke test: build a small DiT and push one random batch through it.
	    device = "cuda" if torch.cuda.is_available() else "cpu"

	    # Use standard RoPE for both the spatial and the temporal attention.
	    rope_config = {
	        AttentionType.SPATIAL: RotaryType.STANDARD,
	        AttentionType.TEMPORAL: RotaryType.STANDARD,
	    }

	    # Small configuration; temporal attention is bidirectional here.
	    action_dim = 16
	    model_kwargs = dict(
	        in_channels=4,  # e.g., latent channels
	        patch_size=2,
	        dim=256,  # hidden dimension
	        num_layers=4,
	        num_heads=8,
	        action_dim=action_dim,
	        max_frames=16,
	        rope_config=rope_config,
	        temporal_causal=False,
	    )
	    model = DiT(**model_kwargs).to(device)

	    # Dummy inputs laid out as (B, T, H, W, C).
	    B, T, H, W, C = 2, 9, 32, 32, 4
	    x = torch.randn(B, T, H, W, C).to(device)
	    t = torch.randint(0, 1000, (B, T)).to(device)

	    # With compress_rate=4 the action sequence has 4*(T-1)+1 steps.
	    L = 4 * (T - 1) + 1
	    action = torch.randn(B, L, action_dim).to(device)

	    print(f"Running forward pass on device: {device}...")
	    output = model(x, t, action)

	    for label, tensor in (
	        ("Input", x),
	        ("Timestep", t),
	        ("Action", action),
	        ("Output", output),
	    ):
	        print(f"{label} shape: {tensor.shape}")

	    assert output.shape == x.shape, "Output shape mismatch!"
	    print("Forward pass successful!")