Upload modeling_wan_camera.py with huggingface_hub

795c1a4 verified 2 months ago

8.61 kB

	"""
	WanTransformer3DModel with camera control adapter for Wan2.1-Fun-V1.1-1.3B-Control-Camera.

	The camera adapter processes 6-channel Plucker ray embeddings (temporally packed
	to 24 channels) and additively injects them into the patch-embedded latents.
	Architecture matches VideoX-Fun's SimpleAdapter exactly.

	Usage:
	from modeling_wan_camera import WanCameraControlTransformer3DModel
	model = WanCameraControlTransformer3DModel.from_pretrained("path/to/transformer")
	"""

	import math
	from typing import Any

	import torch
	import torch.nn as nn

	from diffusers import WanTransformer3DModel
	from diffusers.configuration_utils import register_to_config
	from diffusers.models.modeling_outputs import Transformer2DModelOutput
	from diffusers.utils import apply_lora_scale


	class ResidualBlock(nn.Module):
	def __init__(self, dim):
	super().__init__()
	self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
	self.relu = nn.ReLU(inplace=True)
	self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)

	def forward(self, x):
	residual = x
	out = self.relu(self.conv1(x))
	out = self.conv2(out)
	out += residual
	return out


	class CameraControlAdapter(nn.Module):
	"""
	Processes per-frame Plucker ray embeddings into features matching the
	transformer's patch-embedded latent shape.

	Pipeline: PixelUnshuffle(8) -> Conv2d -> ResidualBlock(s)
	Input: [B, 24, F, H_pixel, W_pixel] (pixel-resolution camera embeddings)
	Output: [B, inner_dim, F, H_latent/p_h, W_latent/p_w] (matches patch_embedding output)
	"""

	def __init__(self, in_channels, out_channels, kernel_size, stride,
	downscale_factor=8, num_residual_blocks=1):
	super().__init__()
	self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=downscale_factor)
	self.conv = nn.Conv2d(
	in_channels * downscale_factor * downscale_factor,
	out_channels,
	kernel_size=kernel_size,
	stride=stride,
	padding=0,
	)
	self.residual_blocks = nn.Sequential(
	*[ResidualBlock(out_channels) for _ in range(num_residual_blocks)]
	)

	def forward(self, x):
	bs, c, f, h, w = x.size()
	x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
	x = self.pixel_unshuffle(x)
	x = self.conv(x)
	x = self.residual_blocks(x)
	x = x.view(bs, f, x.size(1), x.size(2), x.size(3))
	x = x.permute(0, 2, 1, 3, 4)
	return x


	class WanCameraControlTransformer3DModel(WanTransformer3DModel):
	"""
	WanTransformer3DModel + camera control adapter.

	The adapter output is added to patch-embedded latents before the transformer
	blocks, matching VideoX-Fun's y_camera injection point.

	Extra config params vs base WanTransformer3DModel:
	control_adapter_in_channels (int): Input channels for camera embeddings (default 24)
	control_adapter_downscale_factor (int): PixelUnshuffle factor (default 8)
	control_adapter_num_residual_blocks (int): Number of residual blocks (default 1)

	Extra forward param:
	control_camera_video (Tensor \| None): [B, 24, F, H_px, W_px] Plucker ray embeddings
	"""

	@register_to_config
	def __init__(
	self,
	# Base WanTransformer3DModel params (must be explicit for @register_to_config)
	patch_size: tuple[int, ...] = (1, 2, 2),
	num_attention_heads: int = 12,
	attention_head_dim: int = 128,
	in_channels: int = 32,
	out_channels: int = 16,
	text_dim: int = 4096,
	freq_dim: int = 256,
	ffn_dim: int = 8960,
	num_layers: int = 30,
	cross_attn_norm: bool = True,
	qk_norm: str \| None = "rms_norm_across_heads",
	eps: float = 1e-6,
	image_dim: int \| None = 1280,
	added_kv_proj_dim: int \| None = 1536,
	rope_max_seq_len: int = 1024,
	pos_embed_seq_len: int \| None = None,
	# Camera adapter params
	control_adapter_in_channels: int = 24,
	control_adapter_downscale_factor: int = 8,
	control_adapter_num_residual_blocks: int = 1,
	):
	super().__init__(
	patch_size=patch_size,
	num_attention_heads=num_attention_heads,
	attention_head_dim=attention_head_dim,
	in_channels=in_channels,
	out_channels=out_channels,
	text_dim=text_dim,
	freq_dim=freq_dim,
	ffn_dim=ffn_dim,
	num_layers=num_layers,
	cross_attn_norm=cross_attn_norm,
	qk_norm=qk_norm,
	eps=eps,
	image_dim=image_dim,
	added_kv_proj_dim=added_kv_proj_dim,
	rope_max_seq_len=rope_max_seq_len,
	pos_embed_seq_len=pos_embed_seq_len,
	)
	inner_dim = num_attention_heads * attention_head_dim
	ps = patch_size if isinstance(patch_size, (list, tuple)) else [patch_size] * 3
	self.control_adapter = CameraControlAdapter(
	in_channels=control_adapter_in_channels,
	out_channels=inner_dim,
	kernel_size=tuple(ps[1:]),
	stride=tuple(ps[1:]),
	downscale_factor=control_adapter_downscale_factor,
	num_residual_blocks=control_adapter_num_residual_blocks,
	)

	@apply_lora_scale("attention_kwargs")
	def forward(
	self,
	hidden_states: torch.Tensor,
	timestep: torch.LongTensor,
	encoder_hidden_states: torch.Tensor,
	encoder_hidden_states_image: torch.Tensor \| None = None,
	control_camera_video: torch.Tensor \| None = None,
	return_dict: bool = True,
	attention_kwargs: dict[str, Any] \| None = None,
	) -> torch.Tensor \| dict[str, torch.Tensor]:
	batch_size, num_channels, num_frames, height, width = hidden_states.shape
	p_t, p_h, p_w = self.config.patch_size
	post_patch_num_frames = num_frames // p_t
	post_patch_height = height // p_h
	post_patch_width = width // p_w

	rotary_emb = self.rope(hidden_states)

	hidden_states = self.patch_embedding(hidden_states)

	# Camera adapter: additive injection after patch embedding
	if control_camera_video is not None:
	hidden_states = hidden_states + self.control_adapter(control_camera_video)

	hidden_states = hidden_states.flatten(2).transpose(1, 2)

	if timestep.ndim == 2:
	ts_seq_len = timestep.shape[1]
	timestep = timestep.flatten()
	else:
	ts_seq_len = None

	temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
	timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len
	)
	if ts_seq_len is not None:
	timestep_proj = timestep_proj.unflatten(2, (6, -1))
	else:
	timestep_proj = timestep_proj.unflatten(1, (6, -1))

	if encoder_hidden_states_image is not None:
	encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)

	if torch.is_grad_enabled() and self.gradient_checkpointing:
	for block in self.blocks:
	hidden_states = self._gradient_checkpointing_func(
	block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb
	)
	else:
	for block in self.blocks:
	hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb)

	if temb.ndim == 3:
	shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2)
	shift = shift.squeeze(2)
	scale = scale.squeeze(2)
	else:
	shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)

	shift = shift.to(hidden_states.device)
	scale = scale.to(hidden_states.device)

	hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
	hidden_states = self.proj_out(hidden_states)

	hidden_states = hidden_states.reshape(
	batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1
	)
	hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
	output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)

	if not return_dict:
	return (output,)

	return Transformer2DModelOutput(sample=output)