File size: 10,922 Bytes

from __future__ import annotations

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn


def latent_patch_tokens(latents: torch.Tensor, patch_size: int) -> torch.Tensor:
    """Cut latent maps into non-overlapping square patches, one token per patch.

    Args:
        latents: Tensor of shape (T, B, C, H, W).
        patch_size: Side length of each square patch; must be positive and
            divide both H and W.

    Returns:
        Tensor of shape (T, B, L, C * patch_size * patch_size), where L is the
        number of patches that ``F.unfold`` extracts per map.

    Raises:
        ValueError: If ``latents`` is not 5-D, ``patch_size`` is not positive,
            or H/W is not divisible by ``patch_size``.
    """
    if latents.ndim != 5:
        raise ValueError("latents must have shape (T,B,C,H,W)")
    if patch_size <= 0:
        raise ValueError("patch_size must be positive")

    T, B, C, H, W = latents.shape
    if H % patch_size or W % patch_size:
        raise ValueError(f"latent H,W=({H},{W}) must be divisible by patch_size={patch_size}")

    # F.unfold expects a single leading batch axis, so time and batch are merged.
    merged = latents.reshape(T * B, C, H, W)
    # kernel == stride makes the extracted blocks tile the map without overlap.
    columns = F.unfold(merged, kernel_size=patch_size, stride=patch_size)
    # unfold yields (N, C*p*p, L); move the patch axis ahead of the feature axis.
    per_patch = columns.transpose(1, 2).contiguous()
    token_dim = C * patch_size * patch_size
    return per_patch.reshape(T, B, per_patch.shape[1], token_dim)


def spatial_pool_tokens(
    tokens: torch.Tensor,
    pool_h: int,
    pool_w: int,
    src_h: int,
    src_w: int,
) -> torch.Tensor:
    """Adaptive-average-pool a flattened 2D token grid down to a smaller grid.

    The (N, D) tokens are interpreted as a row-major (src_h, src_w) grid,
    pooled spatially to (pool_h, pool_w), and flattened back to tokens.

    Args:
        tokens: Tensor of shape (N, D); the reshape requires N == src_h * src_w.
        pool_h: Height of the pooled grid.
        pool_w: Width of the pooled grid.
        src_h: Height of the source grid.
        src_w: Width of the source grid.

    Returns:
        Tensor of shape (pool_h * pool_w, D).

    Raises:
        ValueError: If ``tokens`` is not 2-D.
    """
    if tokens.ndim != 2:
        raise ValueError("tokens must have shape (N, D)")

    feature_dim = tokens.shape[-1]
    # Restore the grid, then go channels-first with a unit batch axis because
    # adaptive_avg_pool2d pools over the last two dimensions.
    grid = tokens.reshape(src_h, src_w, feature_dim)
    channels_first = grid.permute(2, 0, 1).unsqueeze(0)
    reduced = F.adaptive_avg_pool2d(channels_first, (pool_h, pool_w))
    # Drop the unit batch axis and return to a channels-last token list.
    channels_last = reduced.squeeze(0).permute(1, 2, 0)
    return channels_last.reshape(-1, feature_dim)


class SpatialConv2DMemoryProjector(nn.Module):
    """Map latent feature maps to DiT-width tokens, one token per grid cell.

    A 1x1 convolution lifts the latent channels to ``mid_channels``; a second
    convolution with an odd kernel and "same" padding mixes neighbouring cells
    and produces ``dit_hidden_size`` channels, so the HxW grid is unchanged.
    """

    # Class-level flag; how callers use it is not visible in this file.
    projects_spatial_latents = True

    def __init__(
        self,
        latent_channels: int,
        dit_hidden_size: int,
        mid_channels: int,
        kernel_size: int = 3,
    ):
        """Build the two convolutions.

        Args:
            latent_channels: Channel count of the incoming latents.
            dit_hidden_size: Channel count of the produced tokens.
            mid_channels: Width between the two convolutions.
            kernel_size: Spatial kernel of the second convolution; must be a
                positive odd integer so that ``kernel_size // 2`` padding
                keeps H and W unchanged.

        Raises:
            ValueError: If ``kernel_size`` is not a positive odd integer.
        """
        super().__init__()
        ksize = int(kernel_size)
        if not (ksize > 0 and ksize % 2 == 1):
            raise ValueError("kernel_size must be a positive odd integer")

        self.latent_channels = int(latent_channels)
        self.dit_hidden_size = int(dit_hidden_size)
        self.mid_channels = int(mid_channels)
        self.kernel_size = ksize
        self.out_features = self.dit_hidden_size

        # Pointwise channel lift followed by a shape-preserving spatial mix.
        self.proj_in = nn.Conv2d(self.latent_channels, self.mid_channels, kernel_size=1)
        self.proj_spatial = nn.Conv2d(
            self.mid_channels,
            self.dit_hidden_size,
            kernel_size=ksize,
            padding=ksize // 2,
        )

    def forward(self, latents: torch.Tensor) -> torch.Tensor:
        """Project latents of shape (T, B, C, H, W) to tokens.

        Returns:
            Contiguous tensor of shape (B, T, H * W, dit_hidden_size).

        Raises:
            ValueError: If ``latents`` is not 5-D or C differs from
                ``latent_channels``.
        """
        if latents.ndim != 5:
            raise ValueError("latents must have shape (T,B,C,H,W)")
        T, B, C, H, W = latents.shape
        if C != self.latent_channels:
            raise ValueError(f"expected {self.latent_channels} latent channels, got {C}")

        # Conv2d wants a single batch axis, so time and batch are merged.
        frames = latents.reshape(T * B, C, H, W)
        hidden = self.proj_in(frames)
        hidden = self.proj_spatial(hidden)

        # Split time/batch again, then go batch-first and channels-last.
        hidden = hidden.reshape(T, B, self.dit_hidden_size, H, W)
        batch_first = hidden.permute(1, 0, 3, 4, 2)
        tokens = batch_first.reshape(B, T, H * W, self.dit_hidden_size)
        return tokens.contiguous()


class CausalConv3DDynamicCompressor(nn.Module):
    """Dynamic memory compressor: delta preprocessing + causal Conv3D on raw latents.

    Replaces ShortTermLatentCompressor (slot cross-attention).
    - Operates directly on (T, C, H, W) raw latents
    - Delta: inp[0]=latent[0], inp[t]=latent[t]-latent[t-1]
    - Causal padding prepends temporal zeros and right-aligns fixed outputs
    - Zero-padded to max_source_frames for fixed output shape
    - No slot cross-attention, no chunking

    Args:
        latent_channels: Channel count C of the incoming latents.
        dit_hidden_size: Width of the produced tokens.
        patch_size: Spatial kernel and stride of the convolution; must divide
            the latent H and W passed to ``forward``.
        conv_kernel_t: Temporal kernel size of the convolution.
        conv_stride_t: Temporal stride of the convolution.
        max_source_frames: Number of most-recent eligible source frames fed to
            the convolution for every target; shorter histories are
            left-padded with zero frames.
        exclude_latest_local_frames: Default exclusion margin. A source frame
            is eligible for a target only if
            ``source_index < target_index - margin``.

    Raises:
        ValueError: If any size hyperparameter is not positive.
    """

    def __init__(
        self,
        latent_channels: int,
        dit_hidden_size: int,
        patch_size: int = 2,
        conv_kernel_t: int = 3,
        conv_stride_t: int = 2,
        max_source_frames: int = 8,
        exclude_latest_local_frames: int = 4,
    ):
        super().__init__()
        self.latent_channels = int(latent_channels)
        self.dit_hidden_size = int(dit_hidden_size)
        self.patch_size = int(patch_size)
        self.conv_kernel_t = int(conv_kernel_t)
        self.conv_stride_t = int(conv_stride_t)
        self.max_source_frames = int(max_source_frames)
        self.exclude_latest_local_frames = int(exclude_latest_local_frames)
        # Fail early with a clear message: a zero stride would otherwise show
        # up as a ZeroDivisionError inside _temporal_output_count.
        for name in (
            "latent_channels",
            "dit_hidden_size",
            "patch_size",
            "conv_kernel_t",
            "conv_stride_t",
            "max_source_frames",
        ):
            value = getattr(self, name)
            if value <= 0:
                raise ValueError(f"{name} must be positive, got {value}")

        self.causal_pad = self._temporal_left_pad()
        self.conv3d = nn.Conv3d(
            self.latent_channels,
            self.dit_hidden_size,
            kernel_size=(self.conv_kernel_t, self.patch_size, self.patch_size),
            stride=(self.conv_stride_t, self.patch_size, self.patch_size),
            padding=0,
        )
        self.out_norm = nn.LayerNorm(self.dit_hidden_size)
        self._init_temporal_as_delta()

    def _init_temporal_as_delta(self) -> None:
        """Initialise the convolution to read a patch average of the newest delta.

        Delta preprocessing happens in ``forward``. Every output channel ``d``
        gets weight ``1 / patch_size**2`` on input channel ``d % C`` at the last
        temporal tap and zero elsewhere, so latent channels repeat across the
        wider DiT hidden dimension. The bias is zeroed.
        """
        weight = self.conv3d.weight
        with torch.no_grad():
            weight.zero_()
            d_out, d_in = weight.shape[:2]
            out_idx = torch.arange(d_out, device=weight.device)
            scale = 1.0 / (self.patch_size * self.patch_size)
            # One indexed write fills the (p, p) window of every output channel.
            weight[out_idx, out_idx % d_in, self.conv_kernel_t - 1] = scale
            if self.conv3d.bias is not None:
                nn.init.zeros_(self.conv3d.bias)

    def _temporal_output_count(self) -> int:
        """Number of temporal outputs: ceil(max_source_frames / conv_stride_t)."""
        return math.ceil(self.max_source_frames / self.conv_stride_t)

    def _temporal_left_pad(self) -> int:
        """Zero frames to prepend so the last window ends on the newest frame.

        Returns 0 when the last window already fits inside the unpadded
        sequence.
        """
        t_out = self._temporal_output_count()
        latest_output_end = (t_out - 1) * self.conv_stride_t + self.conv_kernel_t - 1
        latest_source = self.max_source_frames - 1
        return max(0, latest_output_end - latest_source)

    def _output_time_indices(self, device: torch.device) -> torch.Tensor:
        """Unpadded source position at which each temporal output window ends."""
        t_out = self._temporal_output_count()
        return (
            torch.arange(t_out, device=device, dtype=torch.long) * self.conv_stride_t
            + self.conv_kernel_t
            - 1
            - self.causal_pad
        )

    def tokens_per_target(self, H: int, W: int) -> int:
        """Number of tokens produced per target frame for an HxW latent map."""
        p = self.patch_size
        T_out = self._temporal_output_count()
        return T_out * (H // p) * (W // p)

    def forward(
        self,
        latents: torch.Tensor,
        frame_indices: torch.Tensor,
        pose: Optional[torch.Tensor],
        target_frame_indices: torch.Tensor,
        source_is_generated: Optional[torch.Tensor] = None,
        exclude_latest_local_frames: Optional[int] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compress each target's recent source history into memory tokens.

        Args:
            latents: Source latents of shape (T_src, B, C, H, W).
            frame_indices: Frame index of every source latent, shape (T_src, B).
            pose: Unused; accepted for interface compatibility.
            target_frame_indices: Target frame indices, shape (T_tgt, B), or
                (T_tgt,) to share the same targets across the batch.
            source_is_generated: Unused; accepted for interface compatibility.
            exclude_latest_local_frames: Optional override of the exclusion
                margin configured on the module.

        Returns:
            ``(tokens, mask)``. ``tokens`` has shape
            (B, T_tgt, num_slots, dit_hidden_size) and ``mask`` is a bool
            tensor of shape (B, T_tgt, num_slots), where
            ``num_slots = T_out * (H // p) * (W // p)``. A slot is marked valid
            when the source position its temporal window ends on holds an
            eligible frame. Tokens of targets with no eligible source are zero.

        Raises:
            ValueError: On malformed shapes, H/W not divisible by the patch
                size, or (for non-empty sources) a channel-count mismatch.
        """
        del pose, source_is_generated
        if latents.ndim != 5:
            raise ValueError("latents must have shape (T_src,B,C,H,W)")
        if exclude_latest_local_frames is None:
            exclude = self.exclude_latest_local_frames
        else:
            exclude = int(exclude_latest_local_frames)
        T_src, B, C, H, W = latents.shape
        p = self.patch_size
        if H % p != 0 or W % p != 0:
            raise ValueError(f"latent H,W=({H},{W}) must be divisible by patch_size={p}")
        if frame_indices.shape != (T_src, B):
            raise ValueError("frame_indices must have shape (T_src,B)")
        if target_frame_indices.ndim == 1:
            target_frame_indices = target_frame_indices[:, None].expand(-1, B)
        if target_frame_indices.ndim != 2 or target_frame_indices.shape[1] != B:
            raise ValueError("target_frame_indices must have shape (T_tgt,B)")

        device = latents.device
        frame_indices = frame_indices.to(device=device)
        target_frame_indices = target_frame_indices.to(device=device)
        T_tgt = target_frame_indices.shape[0]
        n_spatial = (H // p) * (W // p)
        T_out = self._temporal_output_count()
        num_slots = T_out * n_spatial
        max_src = self.max_source_frames
        n_query = B * T_tgt

        if T_src == 0:
            # No history at all: fixed-shape, fully masked output.
            out_tokens = latents.new_zeros((B, T_tgt, num_slots, self.dit_hidden_size))
            out_mask = torch.zeros((B, T_tgt, num_slots), device=device, dtype=torch.bool)
            return out_tokens, out_mask

        # Checked after the empty-history return so that path stays as lenient
        # as before; otherwise the mismatch would surface inside Conv3d.
        if C != self.latent_channels:
            raise ValueError(f"expected {self.latent_channels} latent channels, got {C}")

        # Eligibility of every source frame for every (batch, target) query.
        source_frames = frame_indices.transpose(0, 1).contiguous()
        target_frames = target_frame_indices.transpose(0, 1).contiguous()
        valid = source_frames[:, None, :] < (target_frames[:, :, None] - exclude)
        valid_flat = valid.reshape(n_query, T_src)
        source_frames_flat = source_frames[:, None, :].expand(B, T_tgt, T_src).reshape(n_query, T_src)

        # Keep the most recent eligible frames. Ineligible ones rank as -inf and
        # the flip puts the selection in ascending (oldest-first) order.
        topk = min(max_src, T_src)
        rank = source_frames_flat.to(dtype=torch.float64).masked_fill(~valid_flat, -float("inf"))
        top = torch.topk(rank, k=topk, dim=1, largest=True, sorted=True)
        selected_idx = top.indices.flip(dims=(1,))
        selected_valid = torch.isfinite(top.values).flip(dims=(1,))
        if topk < max_src:
            # Left-pad with invalid placeholders so the newest frame stays last.
            pad_count = max_src - topk
            selected_idx = torch.cat([
                torch.zeros((n_query, pad_count), device=device, dtype=torch.long),
                selected_idx,
            ], dim=1)
            selected_valid = torch.cat([
                torch.zeros((n_query, pad_count), device=device, dtype=torch.bool),
                selected_valid,
            ], dim=1)
        has_valid = selected_valid.any(dim=1)

        # Pick the selected frames with one advanced-index lookup; result is
        # (n_query, max_src, C, H, W). Indices are in range because topk
        # returns positions in [0, T_src) and the padding index 0 exists
        # whenever T_src >= 1.
        batch_ids = torch.arange(B, device=device, dtype=torch.long).repeat_interleave(T_tgt)
        chunk = latents[selected_idx, batch_ids[:, None]]
        chunk = chunk.masked_fill(~selected_valid[:, :, None, None, None], 0)

        # Delta preprocessing: first position is kept, later ones become
        # frame-to-frame differences.
        delta = torch.cat([chunk[:, :1], chunk[:, 1:] - chunk[:, :-1]], dim=1)
        x = delta.permute(0, 2, 1, 3, 4)
        # F.pad lists pads last-dim first: (W, W, H, H, T_left, T_right).
        x = F.pad(x, (0, 0, 0, 0, self.causal_pad, 0))
        x = self.conv3d(x)
        x = self.out_norm(x.permute(0, 2, 3, 4, 1))
        tokens_flat = x.reshape(n_query, num_slots, self.dit_hidden_size)
        tokens_flat = tokens_flat.masked_fill(~has_valid[:, None, None], 0)
        out_tokens = tokens_flat.reshape(B, T_tgt, num_slots, self.dit_hidden_size)

        # A temporal output is valid when the position its window ends on
        # holds an eligible frame; the flag is shared by all spatial cells.
        output_time_idx = self._output_time_indices(device)
        in_range = (output_time_idx >= 0) & (output_time_idx < max_src)
        clamped_time_idx = output_time_idx.clamp(min=0, max=max_src - 1)
        temporal_mask = in_range & selected_valid.index_select(1, clamped_time_idx)
        out_mask = temporal_mask[:, :, None].expand(n_query, T_out, n_spatial).reshape(B, T_tgt, num_slots)
        return out_tokens, out_mask