# Ltx2_2_Wan2.1_VAE_Adapter / adapter_model.py
# (HDHCDev — initial commit, 2701c9d verified; header retained from the
# original hub page listing, commented out so the module parses.)
"""
Latent adapter model: maps LTX-2 latent space (128ch) β†’ WAN 2.1 latent space (16ch).
Handles:
- Channel reduction: 128 β†’ 16
- Spatial upsampling: 4Γ— (LTX uses 32Γ— spatial downscale, WAN uses 8Γ—)
- Temporal upsampling: ~2Γ— (LTX uses 8Γ— temporal downscale, WAN uses 4Γ—)
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class CausalConv3d(nn.Module):
    """3D convolution that is causal along time: the input is left-padded in
    the temporal dimension so no output frame depends on future frames."""

    def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, **kwargs):
        super().__init__()
        # Left-only temporal padding of (kernel_t - 1) frames.
        if isinstance(kernel_size, tuple):
            self.temporal_pad = kernel_size[0] - 1
        else:
            self.temporal_pad = kernel_size - 1
        # Spatial padding is delegated to the conv itself (symmetric).
        # NOTE(review): a tuple `padding` uses padding[1] for both H and W —
        # assumes square spatial padding; confirm if H/W padding ever differ.
        if isinstance(padding, tuple):
            spatial_pad = padding[1]
        else:
            spatial_pad = padding
        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            padding=(0, spatial_pad, spatial_pad),
            **kwargs,
        )

    def forward(self, x):
        """x: (B, C, T, H, W). Pad only the temporal left edge so output
        frame t sees input frames <= t, then convolve."""
        if self.temporal_pad > 0:
            x = F.pad(x, (0, 0, 0, 0, self.temporal_pad, 0))
        return self.conv(x)
class CausalConvTranspose1d(nn.Module):
    """Learned temporal upsampling via a strided ConvTranspose3d whose output
    is cropped so every frame depends only on past/current inputs."""

    def __init__(self, channels, stride=2):
        super().__init__()
        # kernel 3, no built-in padding: we crop the output manually instead.
        self.conv = nn.ConvTranspose3d(
            channels, channels,
            kernel_size=(3, 1, 1), stride=(stride, 1, 1),
            padding=(0, 0, 0),
        )
        self.stride = stride

    def forward(self, x):
        out = self.conv(x)
        # For stride 2 the transpose conv emits 2*T_in + 1 frames. Dropping
        # the first two and the last one leaves 2*(T_in - 1) frames
        # (e.g. T=15 -> 28, T=4 -> 6) with strictly causal dependencies:
        # [X0,X1] -> [X1] -> [X1,X2] -> [X2] ...
        return out[:, :, 2:-1]
class CausalGroupNorm(nn.Module):
    """GroupNorm applied per frame: time is folded into the batch axis so the
    normalization statistics never mix across frames (no temporal leakage)."""

    def __init__(self, num_groups, num_channels):
        super().__init__()
        self.gn = nn.GroupNorm(num_groups, num_channels)

    def forward(self, x):
        b, c, t, h, w = x.shape
        # (B, C, T, H, W) -> (B*T, C, H, W): each frame is normalized alone.
        frames = x.transpose(1, 2).contiguous().view(b * t, c, h, w)
        frames = self.gn(frames)
        # Restore the original (B, C, T, H, W) layout.
        return frames.view(b, t, c, h, w).transpose(1, 2).contiguous()
class CausalResBlock3d(nn.Module):
    """Residual block with per-frame group norms and causal temporal
    convolutions: (norm -> SiLU -> conv) twice, plus an identity skip."""

    def __init__(self, channels):
        super().__init__()
        groups = min(16, channels)
        self.net = nn.Sequential(
            CausalGroupNorm(groups, channels),
            nn.SiLU(),
            CausalConv3d(channels, channels, kernel_size=3, padding=1),
            CausalGroupNorm(groups, channels),
            nn.SiLU(),
            CausalConv3d(channels, channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        return x + self.net(x)
class ResBlock3d(nn.Module):
    """Plain (non-causal) 3D residual block: (GroupNorm -> SiLU -> Conv3d)
    twice, with an identity skip connection."""

    def __init__(self, channels):
        super().__init__()
        groups = min(16, channels)
        self.net = nn.Sequential(
            nn.GroupNorm(groups, channels),
            nn.SiLU(),
            nn.Conv3d(channels, channels, kernel_size=3, padding=1),
            nn.GroupNorm(groups, channels),
            nn.SiLU(),
            nn.Conv3d(channels, channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        return x + self.net(x)
class UpsampleBlock3d(nn.Module):
    """Optional 2x spatial (and optionally temporal) upsampling, followed by
    a channel-changing 3x3x3 conv and a stack of residual blocks.

    Args:
        in_ch / out_ch: input / output channel counts.
        n_res: number of residual blocks after the channel change.
        spatial_up: apply a 2x spatial upsample first.
        temporal_up: also double the temporal size in the upsample step.
        use_conv_transpose: learned ConvTranspose3d upsampling instead of
            trilinear interpolation.
        causal: use causal conv / res-block variants.
    """

    def __init__(self, in_ch, out_ch, n_res=2, spatial_up=True, temporal_up=False,
                 use_conv_transpose=False, causal=False):
        super().__init__()
        stages = []
        if spatial_up:
            scale = (2, 2, 2) if temporal_up else (1, 2, 2)
            if use_conv_transpose:
                stages.append(nn.ConvTranspose3d(in_ch, in_ch, kernel_size=scale, stride=scale))
            else:
                stages.append(nn.Upsample(scale_factor=scale, mode='trilinear', align_corners=False))
        if causal:
            stages.append(CausalConv3d(in_ch, out_ch, kernel_size=3, padding=1))
            stages.extend(CausalResBlock3d(out_ch) for _ in range(n_res))
        else:
            stages.append(nn.Conv3d(in_ch, out_ch, kernel_size=3, padding=1))
            stages.extend(ResBlock3d(out_ch) for _ in range(n_res))
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
class LatentAdapterV3(nn.Module):
    """
    V3 architecture: ~3.37M params. Maps LTX-2 latents (128ch) to WAN 2.1
    latents (16ch) with a total 4x spatial upsample.
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution: 128 -> 128.
        self.input_conv = nn.Sequential(
            nn.Conv3d(128, 128, kernel_size=3, padding=1),
            ResBlock3d(128),
            ResBlock3d(128),
        )
        # Two 2x spatial upsampling stages (4x total): 128 -> 64 -> 32.
        self.up1 = UpsampleBlock3d(128, 64, n_res=2, spatial_up=True)
        self.up2 = UpsampleBlock3d(64, 32, n_res=2, spatial_up=True)
        # Refinement head: 32 -> 16, no further upsampling.
        self.output_block = nn.Sequential(
            nn.Conv3d(32, 16, kernel_size=3, padding=1),
            ResBlock3d(16),
            ResBlock3d(16),
            nn.Conv3d(16, 16, kernel_size=3, padding=1),
        )

    def forward(self, z_ltx, target_shape=None):
        """Map (B, 128, T, H, W) -> (B, 16, T, H*4, W*4); if `target_shape`
        (T, H, W) is given and differs, trilinearly resize to it."""
        z = self.input_conv(z_ltx)
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapter(nn.Module):
    """
    Maps LTX-2 latents to WAN 2.1 latents.

    Input:  (B, 128, T_ltx, H_ltx, W_ltx)
    Output: (B, 16, T_wan, H_wan, W_wan) with H_wan = H_ltx*4, W_wan = W_ltx*4;
    the temporal size is matched by F.interpolate at the end.

    V4 architecture: ~14.4M params. Wider channels and 4 res blocks per stage.
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution: 128 -> 256.
        self.input_conv = nn.Sequential(
            nn.Conv3d(128, 256, kernel_size=3, padding=1),
            *(ResBlock3d(256) for _ in range(4)),
        )
        # Two 2x spatial upsampling stages (4x total): 256 -> 128 -> 64.
        self.up1 = UpsampleBlock3d(256, 128, n_res=4, spatial_up=True)
        self.up2 = UpsampleBlock3d(128, 64, n_res=4, spatial_up=True)
        # Refinement head: 64 -> 32 -> 16, no upsampling.
        self.output_block = nn.Sequential(
            nn.Conv3d(64, 32, kernel_size=3, padding=1),
            ResBlock3d(32),
            ResBlock3d(32),
            nn.Conv3d(32, 16, kernel_size=3, padding=1),
        )

    def forward(self, z_ltx, target_shape=None):
        """
        Args:
            z_ltx: (B, 128, T, H, W) LTX-2 latent.
            target_shape: optional (T_target, H_target, W_target) to match
                exact WAN dimensions via trilinear interpolation.
        """
        z = self.input_conv(z_ltx)
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        # Snap to the exact WAN latent grid if requested.
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapterV6(nn.Module):
    """
    V6 architecture: ~81.2M params. Maps LTX-2 latents (128ch) to WAN 2.1
    latents (16ch) with a total 4x spatial upsample.
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution: 128 -> 512.
        self.input_conv = nn.Sequential(
            nn.Conv3d(128, 512, kernel_size=3, padding=1),
            *(ResBlock3d(512) for _ in range(4)),
        )
        # Two 2x spatial upsampling stages (4x total): 512 -> 256 -> 128.
        self.up1 = UpsampleBlock3d(512, 256, n_res=4, spatial_up=True)
        self.up2 = UpsampleBlock3d(256, 128, n_res=4, spatial_up=True)
        # Refinement head: 128 -> 64 -> 16, no upsampling.
        self.output_block = nn.Sequential(
            nn.Conv3d(128, 64, kernel_size=3, padding=1),
            ResBlock3d(64),
            ResBlock3d(64),
            nn.Conv3d(64, 16, kernel_size=3, padding=1),
        )

    def forward(self, z_ltx, target_shape=None):
        """Map (B, 128, T, H, W) -> (B, 16, T, H*4, W*4); if `target_shape`
        (T, H, W) is given and differs, trilinearly resize to it."""
        z = self.input_conv(z_ltx)
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapterV6_3(nn.Module):
    """
    V6.3 architecture: ~81.8M params. Same layout as V6 but the spatial
    upsampling is a learned ConvTranspose3d instead of trilinear
    interpolation. Maps LTX-2 latents (128ch) to WAN 2.1 latents (16ch).
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution: 128 -> 512.
        self.input_conv = nn.Sequential(
            nn.Conv3d(128, 512, kernel_size=3, padding=1),
            *(ResBlock3d(512) for _ in range(4)),
        )
        # Two learned 2x spatial upsampling stages (4x total): 512 -> 256 -> 128.
        self.up1 = UpsampleBlock3d(512, 256, n_res=4, spatial_up=True, use_conv_transpose=True)
        self.up2 = UpsampleBlock3d(256, 128, n_res=4, spatial_up=True, use_conv_transpose=True)
        # Refinement head: 128 -> 64 -> 16, no upsampling.
        self.output_block = nn.Sequential(
            nn.Conv3d(128, 64, kernel_size=3, padding=1),
            ResBlock3d(64),
            ResBlock3d(64),
            nn.Conv3d(64, 16, kernel_size=3, padding=1),
        )

    def forward(self, z_ltx, target_shape=None):
        """Map (B, 128, T, H, W) -> (B, 16, T, H*4, W*4); if `target_shape`
        (T, H, W) is given and differs, trilinearly resize to it."""
        z = self.input_conv(z_ltx)
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapterV6_4(nn.Module):
    """
    V6.4 architecture: ~81.8M params. Learned ConvTranspose3d spatial
    upsampling plus causal 3D convolutions throughout, so no output frame
    depends on future input frames (eliminates future-frame ghosting).
    Maps LTX-2 latents (128ch) to WAN 2.1 latents (16ch).
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution, causal in time: 128 -> 512.
        self.input_conv = nn.Sequential(
            CausalConv3d(128, 512, kernel_size=3, padding=1),
            *(CausalResBlock3d(512) for _ in range(4)),
        )
        # Two causal learned 2x spatial upsampling stages: 512 -> 256 -> 128.
        self.up1 = UpsampleBlock3d(512, 256, n_res=4, spatial_up=True, use_conv_transpose=True, causal=True)
        self.up2 = UpsampleBlock3d(256, 128, n_res=4, spatial_up=True, use_conv_transpose=True, causal=True)
        # Causal refinement head: 128 -> 64 -> 16, no upsampling.
        self.output_block = nn.Sequential(
            CausalConv3d(128, 64, kernel_size=3, padding=1),
            CausalResBlock3d(64),
            CausalResBlock3d(64),
            CausalConv3d(64, 16, kernel_size=3, padding=1),
        )

    def forward(self, z_ltx, target_shape=None):
        """Map (B, 128, T, H, W) -> (B, 16, T, H*4, W*4); if `target_shape`
        (T, H, W) is given and differs, trilinearly resize to it."""
        z = self.input_conv(z_ltx)
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapterV6_5(nn.Module):
    """
    V6.5 architecture: ~83M params.

    Causal temporal convolutions throughout, a learned ConvTranspose3d
    temporal upsample (T -> 2*(T-1) frames), and an explicit learned
    'Option B' anchor projection of LTX frame 0 that is prepended to the
    main output, giving 2*(T-1) + 1 frames before any final interpolation.
    """

    def __init__(self):
        super().__init__()
        # Feature extraction at input resolution, causal in time: 128 -> 512.
        self.input_conv = nn.Sequential(
            CausalConv3d(128, 512, kernel_size=3, padding=1),
            *(CausalResBlock3d(512) for _ in range(4)),
        )
        # Learned temporal upsample: T -> 2*(T-1).
        self.temporal_up = nn.Sequential(
            CausalConvTranspose1d(512, stride=2),
            CausalResBlock3d(512),
            CausalResBlock3d(512),
        )
        # Two causal learned 2x spatial upsampling stages: 512 -> 256 -> 128.
        self.up1 = UpsampleBlock3d(512, 256, n_res=4, spatial_up=True, use_conv_transpose=True, causal=True)
        self.up2 = UpsampleBlock3d(256, 128, n_res=4, spatial_up=True, use_conv_transpose=True, causal=True)
        # Causal refinement head: 128 -> 64 -> 16, no upsampling.
        self.output_block = nn.Sequential(
            CausalConv3d(128, 64, kernel_size=3, padding=1),
            CausalResBlock3d(64),
            CausalResBlock3d(64),
            CausalConv3d(64, 16, kernel_size=3, padding=1),
        )
        # 'Option B' anchor projection: LTX frame 0 (128ch) straight to one
        # WAN frame (16ch) at 4x spatial resolution.
        self.anchor_proj = nn.Sequential(
            nn.ConvTranspose3d(128, 64, kernel_size=(1, 4, 4), stride=(1, 4, 4)),
            nn.Conv3d(64, 64, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.SiLU(),
            nn.Conv3d(64, 16, kernel_size=(1, 3, 3), padding=(0, 1, 1))
        )

    def forward(self, z_ltx, target_shape=None):
        """Map (B, 128, T, H, W) -> (B, 16, 2*(T-1)+1, H*4, W*4), then
        optionally resize (T, H, W) of the output to `target_shape`."""
        # 1. Anchor: project frame 0 independently -> (B, 16, 1, H*4, W*4).
        anchor = self.anchor_proj(z_ltx[:, :, :1])
        # 2. Main path: T -> 2*(T-1) frames, 4x spatial upsample.
        z = self.input_conv(z_ltx)
        z = self.temporal_up(z)
        assert z.shape[2] == (z_ltx.shape[2] - 1) * 2, f"temporal_up output T={z.shape[2]}, expected {(z_ltx.shape[2] - 1) * 2}"
        z = self.up2(self.up1(z))
        z = self.output_block(z)
        # 3. Prepend the anchor frame: total 2*(T-1) + 1 frames.
        z = torch.cat([anchor, z], dim=2)
        if target_shape is not None and z.shape[2:] != target_shape:
            z = F.interpolate(z, size=target_shape, mode='trilinear', align_corners=False)
        return z

    def param_count(self):
        """Total number of parameters in the adapter."""
        return sum(p.numel() for p in self.parameters())
class LatentAdapterV6_6(LatentAdapterV6_5):
    """
    V6.6 architecture: identical network to V6.5 (~83M params).

    Marker subclass only: per the original note, when training this variant
    the framework does not truncate the identity anchor slice out of the
    loss. No weights or layers differ from V6.5.
    """
    pass
if __name__ == "__main__":
    # Smoke test: build the V4 adapter and run two latent shapes through it.
    model = LatentAdapter()
    print(f"Parameter count: {model.param_count():,}")
    print(f"Model size (fp32): {model.param_count() * 4 / 1024 / 1024:.1f} MB")

    # Each case: (label, LTX latent shape, WAN target (T, H, W)).
    # LTX: 32x spatial / 8x temporal downscale; WAN: 8x spatial / 4x temporal.
    cases = [
        ("Portrait test (480Γ—704, 25 frames)", (1, 128, 4, 15, 22), (7, 60, 88)),
        ("Landscape test (480Γ—832, 25 frames)", (1, 128, 4, 15, 26), (7, 60, 104)),
    ]
    for label, in_shape, target in cases:
        z_in = torch.randn(*in_shape)
        z_out = model(z_in, target_shape=target)
        print(f"\n{label}:")
        print(f" Input: {z_in.shape}")
        print(f" Output: {z_out.shape}")
        print(f" Target: (1, 16, {target[0]}, {target[1]}, {target[2]})")