xiaruize committed
Commit 234a70c · 1 Parent(s): fb59797
config.py CHANGED
@@ -1,16 +1,101 @@
1
- # Example config for Text2Sign model
3
  class ModelConfig:
4
- vocab_size = 30522
5
- max_text_length = 77
6
- use_clip_text_encoder = False
7
- # ... other model hyperparameters ...
8
9
  class GenerationConfig:
10
- num_inference_steps = 50
11
- guidance_scale = 7.5
12
- eta = 0.0
13
- fps = 8
14
- # ... other generation settings ...
15
 
16
- # Add any additional config as needed for your model
1
+ """
2
+ Configuration for Text-to-Sign Language DDIM Diffusion Model
3
+ """
4
 
5
+ from dataclasses import dataclass
6
+ from typing import Optional, Tuple
7
+ import torch
8
+
9
+
10
+ @dataclass
11
  class ModelConfig:
12
+ """Model architecture configuration"""
13
+ # Image/Video dimensions
14
+ image_size: int = 64 # Resize GIFs to 64x64
15
+ num_frames: int = 16 # Number of frames per video
16
+ in_channels: int = 3 # RGB channels
17
+
18
+ # UNet architecture (increased capacity for better quality)
19
+ model_channels: int = 96 # Increased from 64 for better quality
20
+ channel_mult: Tuple[int, ...] = (1, 2, 4) # Depth levels
21
+ num_res_blocks: int = 2
22
+ attention_resolutions: Tuple[int, ...] = (8, 16)
23
+ num_heads: int = 6 # Increased from 4 for better attention
24
+
25
+ # Transformer settings (DiT-style)
26
+ use_transformer: bool = True # Use enhanced DiT-style transformer blocks
27
+ transformer_depth: int = 2 # Increased from 1 for deeper transformers
28
+ use_gradient_checkpointing: bool = True # Enable gradient checkpointing for memory savings
29
+
30
+ # Text encoder
31
+ use_clip_text_encoder: bool = True # Default to frozen pretrained CLIP text encoder
32
+ text_embed_dim: int = 384 # Increased from 256 for richer text embeddings
33
+ max_text_length: int = 77
34
+ vocab_size: int = 49408 # CLIP vocab size
35
+
36
+ # Cross attention
37
+ context_dim: int = 384 # Increased from 256 for better cross-attention
38
+
39
+
40
+ @dataclass
41
+ class DDIMConfig:
42
+ """DDIM scheduler configuration"""
43
+ num_train_timesteps: int = 100
44
+ num_inference_steps: int = 100
45
+ beta_start: float = 0.0001
46
+ beta_end: float = 0.02
47
+ beta_schedule: str = "linear" # "linear" or "cosine"
48
+ clip_sample: bool = True
49
+ prediction_type: str = "epsilon" # "epsilon" or "v_prediction"
50
 
51
+
52
+ @dataclass
53
+ class TrainingConfig:
54
+ """Training configuration"""
55
+ # Data
56
+ data_dir: str = "text2sign/training_data"
57
+ batch_size: int = 2 # Reduced from 4 for memory
58
+ num_workers: int = 4
59
+
60
+ # Training
61
+ num_epochs: int = 150 # Increased for more training
62
+ learning_rate: float = 5e-5 # Reduced from 1e-4 for fine-tuning stability
63
+ weight_decay: float = 0.01
64
+ warmup_steps: int = 500 # Reduced warmup for fine-tuning
65
+ gradient_accumulation_steps: int = 8 # Effective batch size = 16
66
+ max_grad_norm: float = 1.0
67
+
68
+ # Mixed precision
69
+ use_amp: bool = True
70
+
71
+ # Checkpointing
72
+ checkpoint_dir: str = "text_to_sign/checkpoints"
73
+ save_every: int = 5 # Save every N epochs
74
+ log_every: int = 100 # Log every N steps
75
+ sample_every: int = 1000 # Generate samples every N steps
76
+
77
+ # TensorBoard
78
+ log_dir: str = "text_to_sign/logs"
79
+
80
+ # Device
81
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
82
+
83
+
84
+ @dataclass
85
  class GenerationConfig:
86
+ """Generation/Inference configuration"""
87
+ num_inference_steps: int = 50
88
+ guidance_scale: float = 7.5
89
+ eta: float = 0.0 # 0 for DDIM, 1 for DDPM
90
+ output_dir: str = "text_to_sign/generated"
91
+ fps: int = 8 # Output GIF frame rate
92
+
93
 
94
+ def get_config():
95
+ """Get all configurations"""
96
+ return {
97
+ "model": ModelConfig(),
98
+ "ddim": DDIMConfig(),
99
+ "training": TrainingConfig(),
100
+ "generation": GenerationConfig(),
101
+ }
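Usage note (not part of the commit): a minimal sketch of consuming get_config(), assuming config.py and the models package added below are importable. TrainingConfig's gradient accumulation works out to an effective batch size of batch_size × gradient_accumulation_steps = 2 × 8 = 16, matching the inline comment.

from config import get_config
from models import create_unet, create_text_encoder

cfg = get_config()
unet = create_unet(cfg["model"])                   # UNet3D sized by ModelConfig
text_encoder = create_text_encoder(cfg["model"])   # frozen CLIP by default, custom fallback otherwise
train_cfg = cfg["training"]
print(train_cfg.batch_size * train_cfg.gradient_accumulation_steps)  # 16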
dataset.py ADDED
@@ -0,0 +1,242 @@
1
+ """
2
+ Dataset for loading text-GIF pairs for sign language generation
3
+ """
4
+
5
+ import os
6
+ import glob
7
+ import random
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+ import torch
11
+ from torch.utils.data import Dataset, DataLoader
12
+ from PIL import Image
13
+ import numpy as np
14
+ from torchvision import transforms
15
+
16
+
17
+ class SignLanguageDataset(Dataset):
18
+ """Dataset for text-to-sign language video generation"""
19
+
20
+ def __init__(
21
+ self,
22
+ data_dir: str,
23
+ image_size: int = 64,
24
+ num_frames: int = 16,
25
+ train: bool = True,
26
+ train_ratio: float = 0.9,
27
+ ):
28
+ """
29
+ Args:
30
+ data_dir: Directory containing .gif and .txt files
31
+ image_size: Size to resize frames to
32
+ num_frames: Number of frames to sample from each GIF
33
+ train: Whether this is training set
34
+ train_ratio: Ratio of data to use for training
35
+ """
36
+ self.data_dir = data_dir
37
+ self.image_size = image_size
38
+ self.num_frames = num_frames
39
+ self.train = train
40
+
41
+ # Find all pairs
42
+ self.pairs = self._find_pairs()
43
+
44
+ # Split into train/val
45
+ random.seed(42)
46
+ indices = list(range(len(self.pairs)))
47
+ random.shuffle(indices)
48
+ split_idx = int(len(indices) * train_ratio)
49
+
50
+ if train:
51
+ self.indices = indices[:split_idx]
52
+ else:
53
+ self.indices = indices[split_idx:]
54
+
55
+ # Image transforms
56
+ self.transform = transforms.Compose([
57
+ transforms.Resize((image_size, image_size)),
58
+ transforms.ToTensor(),
59
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) # [-1, 1]
60
+ ])
61
+
62
+ print(f"Loaded {len(self.indices)} {'training' if train else 'validation'} samples")
63
+
64
+ def _find_pairs(self) -> List[Tuple[str, str]]:
65
+ """Find all GIF-text pairs in the data directory"""
66
+ pairs = []
67
+
68
+ # Find all GIF files
69
+ gif_files = glob.glob(os.path.join(self.data_dir, "*.gif"))
70
+
71
+ for gif_path in gif_files:
72
+ # Find corresponding text file
73
+ txt_path = gif_path.replace(".gif", ".txt")
74
+
75
+ if os.path.exists(txt_path):
76
+ pairs.append((gif_path, txt_path))
77
+
78
+ return pairs
79
+
80
+ def _load_gif(self, gif_path: str) -> torch.Tensor:
81
+ """Load GIF and sample frames"""
82
+ try:
83
+ gif = Image.open(gif_path)
84
+
85
+ # Get all frames
86
+ frames = []
87
+ try:
88
+ while True:
89
+ # Convert to RGB
90
+ frame = gif.convert("RGB")
91
+ frame = self.transform(frame)
92
+ frames.append(frame)
93
+ gif.seek(gif.tell() + 1)
94
+ except EOFError:
95
+ pass
96
+
97
+ if len(frames) == 0:
98
+ raise ValueError(f"No frames found in {gif_path}")
99
+
100
+ # Sample or pad frames
101
+ if len(frames) >= self.num_frames:
102
+ # Uniform sampling
103
+ indices = np.linspace(0, len(frames) - 1, self.num_frames, dtype=int)
104
+ frames = [frames[i] for i in indices]
105
+ else:
106
+ # Pad by repeating last frame
107
+ while len(frames) < self.num_frames:
108
+ frames.append(frames[-1])
109
+
110
+ # Stack frames: (num_frames, C, H, W)
111
+ video = torch.stack(frames)
112
+
113
+ return video
114
+
115
+ except Exception as e:
116
+ print(f"Error loading {gif_path}: {e}")
117
+ # Return random noise as fallback
118
+ return torch.randn(self.num_frames, 3, self.image_size, self.image_size)
119
+
120
+ def _load_text(self, txt_path: str) -> str:
121
+ """Load text from file"""
122
+ try:
123
+ with open(txt_path, "r", encoding="utf-8") as f:
124
+ text = f.read().strip()
125
+ return text
126
+ except Exception as e:
127
+ print(f"Error loading {txt_path}: {e}")
128
+ return ""
129
+
130
+ def __len__(self) -> int:
131
+ return len(self.indices)
132
+
133
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
134
+ real_idx = self.indices[idx]
135
+ gif_path, txt_path = self.pairs[real_idx]
136
+
137
+ video = self._load_gif(gif_path) # (T, C, H, W)
138
+ text = self._load_text(txt_path)
139
+
140
+ return {
141
+ "video": video,
142
+ "text": text,
143
+ }
144
+
145
+
146
+ class SimpleTokenizer:
147
+ """Simple tokenizer for text encoding"""
148
+
149
+ def __init__(self, vocab_size: int = 49408, max_length: int = 77):
150
+ self.vocab_size = vocab_size
151
+ self.max_length = max_length
152
+
153
+ # Simple character-level tokenization with hash
154
+ self.bos_token_id = 0
155
+ self.eos_token_id = 1
156
+ self.pad_token_id = 2
157
+
158
+ def encode(self, text: str) -> torch.Tensor:
159
+ """Encode text to token IDs"""
160
+ # Simple hash-based encoding
161
+ tokens = [self.bos_token_id]
162
+
163
+ for char in text.lower():
164
+ # Hash character to token ID
165
+ token_id = (ord(char) % (self.vocab_size - 3)) + 3
166
+ tokens.append(token_id)
167
+
168
+ if len(tokens) >= self.max_length - 1:
169
+ break
170
+
171
+ tokens.append(self.eos_token_id)
172
+
173
+ # Pad to max_length
174
+ while len(tokens) < self.max_length:
175
+ tokens.append(self.pad_token_id)
176
+
177
+ return torch.tensor(tokens[:self.max_length], dtype=torch.long)
178
+
179
+ def __call__(self, texts: List[str]) -> torch.Tensor:
180
+ """Batch encode texts"""
181
+ return torch.stack([self.encode(text) for text in texts])
182
+
183
+
184
+ def collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor]:
185
+ """Custom collate function for batching"""
186
+ tokenizer = SimpleTokenizer()
187
+
188
+ videos = torch.stack([item["video"] for item in batch])
189
+ texts = [item["text"] for item in batch]
190
+ tokens = tokenizer(texts)
191
+
192
+ return {
193
+ "video": videos, # (B, T, C, H, W)
194
+ "tokens": tokens, # (B, max_length)
195
+ "text": texts, # List of strings
196
+ }
197
+
198
+
199
+ def get_dataloader(
200
+ data_dir: str,
201
+ batch_size: int = 4,
202
+ image_size: int = 64,
203
+ num_frames: int = 16,
204
+ num_workers: int = 4,
205
+ train: bool = True,
206
+ ) -> DataLoader:
207
+ """Create dataloader for training or validation"""
208
+
209
+ dataset = SignLanguageDataset(
210
+ data_dir=data_dir,
211
+ image_size=image_size,
212
+ num_frames=num_frames,
213
+ train=train,
214
+ )
215
+
216
+ dataloader = DataLoader(
217
+ dataset,
218
+ batch_size=batch_size,
219
+ shuffle=train,
220
+ num_workers=num_workers,
221
+ collate_fn=collate_fn,
222
+ pin_memory=True,
223
+ drop_last=train,
224
+ )
225
+
226
+ return dataloader
227
+
228
+
229
+ if __name__ == "__main__":
230
+ # Test dataset
231
+ dataset = SignLanguageDataset(
232
+ data_dir="text2sign/training_data",
233
+ image_size=64,
234
+ num_frames=16,
235
+ train=True,
236
+ )
237
+
238
+ print(f"Dataset size: {len(dataset)}")
239
+
240
+ sample = dataset[0]
241
+ print(f"Video shape: {sample['video'].shape}")
242
+ print(f"Text: {sample['text']}")
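Usage note (not part of the commit): a minimal sketch of building the training loader and inspecting one batch, assuming .gif/.txt pairs exist under text2sign/training_data. collate_fn tokenizes captions with the hash-based SimpleTokenizer, so tokens has the fixed length 77.

from dataset import get_dataloader

loader = get_dataloader(
    data_dir="text2sign/training_data",
    batch_size=2,
    image_size=64,
    num_frames=16,
    train=True,
)
batch = next(iter(loader))
print(batch["video"].shape)   # torch.Size([2, 16, 3, 64, 64]) -> (B, T, C, H, W), values in [-1, 1]
print(batch["tokens"].shape)  # torch.Size([2, 77])
print(batch["text"][0])       # raw caption string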
inference.py CHANGED
@@ -2,11 +2,6 @@ import torch
2
  from PIL import Image
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
- import sys
6
- import os
7
-
8
- # Add model code to path if needed
9
- sys.path.append(os.path.join(os.path.dirname(__file__), "../text_to_sign"))
10
  from pipeline import Text2SignPipeline
11
 
12
  def generate_and_save(prompt, checkpoint_path, output_path, device="cuda"):
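Usage note (not part of the commit): with the sys.path patching removed, this script assumes pipeline.py is importable from the working directory. A call sketch with illustrative arguments (the checkpoint and output file names are hypothetical; only the directories come from the configs):

generate_and_save(
    prompt="hello",
    checkpoint_path="text_to_sign/checkpoints/latest.pt",  # hypothetical file name
    output_path="text_to_sign/generated/hello.gif",        # hypothetical file name
    device="cuda",
)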
models/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """
2
+ Models package for text-to-sign language generation
3
+ """
4
+
5
+ from .unet3d import UNet3D, create_unet
6
+ from .text_encoder import TextEncoder, FrozenCLIPTextEncoder, create_text_encoder
7
+
8
+ __all__ = [
9
+ "UNet3D",
10
+ "create_unet",
11
+ "TextEncoder",
12
+ "FrozenCLIPTextEncoder",
13
+ "create_text_encoder",
14
+ ]
models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (445 Bytes)
 
models/__pycache__/text_encoder.cpython-310.pyc ADDED
Binary file (7.47 kB)
 
models/__pycache__/unet3d.cpython-310.pyc ADDED
Binary file (24.3 kB)
 
models/text_encoder.py ADDED
@@ -0,0 +1,268 @@
1
+ """
2
+ Text encoder for conditioning the diffusion model
3
+ Uses a simple transformer architecture
4
+ """
5
+
6
+ import math
7
+ from typing import Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class PositionalEncoding(nn.Module):
15
+ """Sinusoidal positional encoding"""
16
+ def __init__(self, d_model: int, max_len: int = 5000):
17
+ super().__init__()
18
+
19
+ pe = torch.zeros(max_len, d_model)
20
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
21
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
22
+
23
+ pe[:, 0::2] = torch.sin(position * div_term)
24
+ pe[:, 1::2] = torch.cos(position * div_term)
25
+ pe = pe.unsqueeze(0)
26
+
27
+ self.register_buffer('pe', pe)
28
+
29
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
30
+ return x + self.pe[:, :x.size(1)]
31
+
32
+
33
+ class TransformerEncoderLayer(nn.Module):
34
+ """Single transformer encoder layer"""
35
+ def __init__(
36
+ self,
37
+ d_model: int,
38
+ num_heads: int,
39
+ dim_feedforward: int = 2048,
40
+ dropout: float = 0.1,
41
+ ):
42
+ super().__init__()
43
+
44
+ self.self_attn = nn.MultiheadAttention(
45
+ d_model, num_heads, dropout=dropout, batch_first=True
46
+ )
47
+
48
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
49
+ self.dropout = nn.Dropout(dropout)
50
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
51
+
52
+ self.norm1 = nn.LayerNorm(d_model)
53
+ self.norm2 = nn.LayerNorm(d_model)
54
+
55
+ self.dropout1 = nn.Dropout(dropout)
56
+ self.dropout2 = nn.Dropout(dropout)
57
+
58
+ def forward(
59
+ self,
60
+ x: torch.Tensor,
61
+ mask: Optional[torch.Tensor] = None,
62
+ ) -> torch.Tensor:
63
+ # Self attention
64
+ x2, _ = self.self_attn(x, x, x, key_padding_mask=mask)
65
+ x = x + self.dropout1(x2)
66
+ x = self.norm1(x)
67
+
68
+ # Feed forward
69
+ x2 = self.linear2(self.dropout(F.gelu(self.linear1(x))))
70
+ x = x + self.dropout2(x2)
71
+ x = self.norm2(x)
72
+
73
+ return x
74
+
75
+
76
+ class TextEncoder(nn.Module):
77
+ """
78
+ Transformer-based text encoder for conditioning
79
+ Similar to CLIP text encoder but simplified
80
+ """
81
+ def __init__(
82
+ self,
83
+ vocab_size: int = 49408,
84
+ max_length: int = 77,
85
+ embed_dim: int = 512,
86
+ num_layers: int = 6,
87
+ num_heads: int = 8,
88
+ dropout: float = 0.1,
89
+ ):
90
+ super().__init__()
91
+
92
+ self.vocab_size = vocab_size
93
+ self.max_length = max_length
94
+ self.embed_dim = embed_dim
95
+
96
+ # Token embedding
97
+ self.token_embedding = nn.Embedding(vocab_size, embed_dim)
98
+
99
+ # Positional encoding
100
+ self.pos_encoding = PositionalEncoding(embed_dim, max_length)
101
+
102
+ # Transformer layers
103
+ self.layers = nn.ModuleList([
104
+ TransformerEncoderLayer(
105
+ d_model=embed_dim,
106
+ num_heads=num_heads,
107
+ dim_feedforward=embed_dim * 4,
108
+ dropout=dropout,
109
+ )
110
+ for _ in range(num_layers)
111
+ ])
112
+
113
+ # Final layer norm
114
+ self.final_norm = nn.LayerNorm(embed_dim)
115
+
116
+ # Initialize weights
117
+ self._init_weights()
118
+
119
+ def _init_weights(self):
120
+ """Initialize weights"""
121
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
122
+
123
+ def forward(
124
+ self,
125
+ tokens: torch.Tensor, # (B, seq_len)
126
+ return_pooled: bool = False,
127
+ ) -> torch.Tensor:
128
+ """
129
+ Forward pass
130
+ Args:
131
+ tokens: Token IDs (B, seq_len)
132
+ return_pooled: Whether to return pooled output (first token)
133
+ Returns:
134
+ Text embeddings (B, seq_len, embed_dim) or (B, embed_dim) if pooled
135
+ """
136
+ # Token embedding
137
+ x = self.token_embedding(tokens) # (B, seq_len, embed_dim)
138
+
139
+ # Add positional encoding
140
+ x = self.pos_encoding(x)
141
+
142
+ # Create attention mask for padding (token_id == 2)
143
+ padding_mask = (tokens == 2) # pad_token_id = 2
144
+
145
+ # Transformer layers
146
+ for layer in self.layers:
147
+ x = layer(x, mask=padding_mask)
148
+
149
+ # Final norm
150
+ x = self.final_norm(x)
151
+
152
+ if return_pooled:
153
+ # Return first token embedding (like [CLS])
154
+ return x[:, 0]
155
+
156
+ return x
157
+
158
+
159
+ class FrozenCLIPTextEncoder(nn.Module):
160
+ """
161
+ Wrapper for using pretrained CLIP text encoder (if available)
162
+ Falls back to custom TextEncoder if CLIP is not available
163
+ """
164
+ def __init__(
165
+ self,
166
+ embed_dim: int = 512,
167
+ max_length: int = 77,
168
+ ):
169
+ super().__init__()
170
+
171
+ self.embed_dim = embed_dim
172
+ self.max_length = max_length
173
+
174
+ try:
175
+ from transformers import CLIPTextModel, CLIPTokenizer
176
+
177
+ self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
178
+ self.model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
179
+
180
+ # Freeze the model
181
+ for param in self.model.parameters():
182
+ param.requires_grad = False
183
+
184
+ # Project to target dim if needed
185
+ clip_dim = self.model.config.hidden_size
186
+ if clip_dim != embed_dim:
187
+ self.proj = nn.Linear(clip_dim, embed_dim)
188
+ else:
189
+ self.proj = nn.Identity()
190
+
191
+ self.use_clip = True
192
+ print("Using pretrained CLIP text encoder")
193
+
194
+ except Exception as e:
195
+ print(f"CLIP not available ({e}), using custom text encoder")
196
+ self.model = TextEncoder(
197
+ embed_dim=embed_dim,
198
+ max_length=max_length,
199
+ )
200
+ self.proj = nn.Identity()
201
+ self.use_clip = False
202
+
203
+ def forward(
204
+ self,
205
+ tokens: torch.Tensor,
206
+ text: Optional[list] = None,
207
+ ) -> torch.Tensor:
208
+ """
209
+ Forward pass
210
+ Args:
211
+ tokens: Pre-tokenized token IDs (B, seq_len) - used if not using CLIP
212
+ text: List of text strings - used if using CLIP
213
+ Returns:
214
+ Text embeddings (B, seq_len, embed_dim)
215
+ """
216
+ if self.use_clip and text is not None:
217
+ # Tokenize with CLIP tokenizer
218
+ inputs = self.tokenizer(
219
+ text,
220
+ padding="max_length",
221
+ max_length=self.max_length,
222
+ truncation=True,
223
+ return_tensors="pt",
224
+ )
225
+ inputs = {k: v.to(next(self.model.parameters()).device) for k, v in inputs.items()}
226
+
227
+ with torch.no_grad():
228
+ outputs = self.model(**inputs)
229
+ hidden_states = outputs.last_hidden_state
230
+
231
+ return self.proj(hidden_states)
232
+ else:
233
+ return self.proj(self.model(tokens))
234
+
235
+
236
+ def create_text_encoder(config, use_clip: bool = True):
237
+ """Create text encoder from config (default: pretrained CLIP)"""
238
+ if use_clip:
239
+ return FrozenCLIPTextEncoder(
240
+ embed_dim=config.text_embed_dim,
241
+ max_length=config.max_text_length,
242
+ )
243
+ else:
244
+ return TextEncoder(
245
+ vocab_size=config.vocab_size,
246
+ max_length=config.max_text_length,
247
+ embed_dim=config.text_embed_dim,
248
+ )
249
+
250
+
251
+ if __name__ == "__main__":
252
+ # Test the encoder
253
+ encoder = TextEncoder(
254
+ vocab_size=49408,
255
+ max_length=77,
256
+ embed_dim=512,
257
+ num_layers=6,
258
+ num_heads=8,
259
+ )
260
+
261
+ # Test input
262
+ tokens = torch.randint(0, 49408, (2, 77))
263
+
264
+ # Forward pass
265
+ output = encoder(tokens)
266
+ print(f"Input shape: {tokens.shape}")
267
+ print(f"Output shape: {output.shape}")
268
+ print(f"Parameters: {sum(p.numel() for p in encoder.parameters()):,}")
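Usage note (not part of the commit): a minimal sketch exercising both encoder paths at the 384-dim embedding size from ModelConfig. The CLIP branch tokenizes raw strings internally; the fallback branch expects SimpleTokenizer ids.

from models.text_encoder import FrozenCLIPTextEncoder
from dataset import SimpleTokenizer

encoder = FrozenCLIPTextEncoder(embed_dim=384, max_length=77)
if encoder.use_clip:
    emb = encoder(tokens=None, text=["hello", "thank you"])         # CLIP path, projected 512 -> 384
else:
    tokens = SimpleTokenizer(max_length=77)(["hello", "thank you"])
    emb = encoder(tokens)                                           # custom TextEncoder path
print(emb.shape)  # torch.Size([2, 77, 384])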
models/unet3d.py ADDED
@@ -0,0 +1,961 @@
1
+ """
2
+ 3D UNet architecture for video diffusion with text conditioning
3
+ Enhanced with Transformer (DiT-style) blocks for better temporal modeling
4
+
5
+ Based on:
6
+ - Diffusion Transformers (DiT) - Peebles & Xie 2023
7
+ - Video diffusion models with temporal attention
8
+ """
9
+
10
+ import math
11
+ from typing import Optional, Tuple
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ from einops import rearrange, repeat
17
+
18
+
19
+ def get_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int) -> torch.Tensor:
20
+ """
21
+ Create sinusoidal timestep embeddings.
22
+ """
23
+ assert len(timesteps.shape) == 1
24
+
25
+ half_dim = embedding_dim // 2
26
+ emb = math.log(10000) / (half_dim - 1)
27
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) * -emb)
28
+ emb = timesteps.float()[:, None] * emb[None, :]
29
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
30
+
31
+ if embedding_dim % 2 == 1: # zero pad
32
+ emb = F.pad(emb, (0, 1), mode='constant')
33
+
34
+ return emb
35
+
36
+
37
+ def get_3d_sincos_pos_embed(embed_dim: int, grid_size: Tuple[int, int, int]) -> torch.Tensor:
38
+ """
39
+ Generate 3D sinusoidal positional embeddings for video (T, H, W).
40
+ """
41
+ t, h, w = grid_size
42
+
43
+ grid_t = torch.arange(t, dtype=torch.float32)
44
+ grid_h = torch.arange(h, dtype=torch.float32)
45
+ grid_w = torch.arange(w, dtype=torch.float32)
46
+
47
+ grid = torch.meshgrid(grid_t, grid_h, grid_w, indexing='ij')
48
+ grid = torch.stack(grid, dim=0) # (3, T, H, W)
49
+ grid = grid.reshape(3, -1).T # (T*H*W, 3)
50
+
51
+ # Split embedding dim across 3 dimensions
52
+ dim_t = embed_dim // 3
53
+ dim_h = embed_dim // 3
54
+ dim_w = embed_dim - dim_t - dim_h
55
+
56
+ def get_1d_sincos(positions, dim):
57
+ omega = torch.arange(dim // 2, dtype=torch.float32)
58
+ omega = 1.0 / (10000 ** (omega / (dim // 2)))
59
+ out = positions[:, None] * omega[None, :]
60
+ return torch.cat([torch.sin(out), torch.cos(out)], dim=1)
61
+
62
+ emb_t = get_1d_sincos(grid[:, 0], dim_t)
63
+ emb_h = get_1d_sincos(grid[:, 1], dim_h)
64
+ emb_w = get_1d_sincos(grid[:, 2], dim_w)
65
+
66
+ return torch.cat([emb_t, emb_h, emb_w], dim=1) # (T*H*W, embed_dim)
67
+
68
+
69
+ class GroupNorm32(nn.GroupNorm):
70
+ """GroupNorm with float32 computation for stability"""
71
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
72
+ return super().forward(x.float()).type(x.dtype)
73
+
74
+
75
+ class RMSNorm(nn.Module):
76
+ """Root Mean Square Layer Normalization (more efficient than LayerNorm)"""
77
+ def __init__(self, dim: int, eps: float = 1e-6):
78
+ super().__init__()
79
+ self.eps = eps
80
+ self.weight = nn.Parameter(torch.ones(dim))
81
+
82
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
83
+ rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + self.eps)
84
+ return x / rms * self.weight
85
+
86
+
87
+ class AdaLayerNorm(nn.Module):
88
+ """Adaptive Layer Normalization conditioned on timestep (DiT-style)"""
89
+ def __init__(self, dim: int, time_embed_dim: int):
90
+ super().__init__()
91
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False)
92
+ self.proj = nn.Linear(time_embed_dim, dim * 2)
93
+
94
+ def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
95
+ # t_emb: (B, time_embed_dim)
96
+ scale_shift = self.proj(t_emb)
97
+ scale, shift = scale_shift.chunk(2, dim=-1)
98
+
99
+ # Handle different input shapes
100
+ if x.dim() == 3: # (B, N, C)
101
+ scale = scale.unsqueeze(1)
102
+ shift = shift.unsqueeze(1)
103
+ elif x.dim() == 5: # (B, C, T, H, W)
104
+ scale = scale[:, :, None, None, None]
105
+ shift = shift[:, :, None, None, None]
106
+
107
+ return self.norm(x) * (1 + scale) + shift
108
+
109
+
110
+ class AdaLayerNormZero(nn.Module):
111
+ """Adaptive Layer Normalization with zero-init (DiT-style)"""
112
+ def __init__(self, dim: int, time_embed_dim: int):
113
+ super().__init__()
114
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False)
115
+ self.proj = nn.Linear(time_embed_dim, dim * 6) # scale, shift, gate for both attn and ff
116
+ nn.init.zeros_(self.proj.weight)
117
+ nn.init.zeros_(self.proj.bias)
118
+
119
+ def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> Tuple[torch.Tensor, ...]:
120
+ params = self.proj(t_emb)
121
+ return self.norm(x), params.chunk(6, dim=-1)
122
+
123
+
124
+ class Upsample3D(nn.Module):
125
+ """3D Upsampling with convolution"""
126
+ def __init__(self, channels: int):
127
+ super().__init__()
128
+ self.conv = nn.Conv3d(channels, channels, 3, padding=1)
129
+
130
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
131
+ x = F.interpolate(x, scale_factor=(1, 2, 2), mode='nearest')
132
+ return self.conv(x)
133
+
134
+
135
+ class Downsample3D(nn.Module):
136
+ """3D Downsampling with convolution"""
137
+ def __init__(self, channels: int):
138
+ super().__init__()
139
+ self.conv = nn.Conv3d(channels, channels, 3, stride=(1, 2, 2), padding=1)
140
+
141
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
142
+ return self.conv(x)
143
+
144
+
145
+ class ResBlock3D(nn.Module):
146
+ """3D Residual block with time and context conditioning"""
147
+ def __init__(
148
+ self,
149
+ in_channels: int,
150
+ out_channels: int,
151
+ time_emb_dim: int,
152
+ dropout: float = 0.1,
153
+ ):
154
+ super().__init__()
155
+
156
+ self.in_layers = nn.Sequential(
157
+ GroupNorm32(32, in_channels),
158
+ nn.SiLU(),
159
+ nn.Conv3d(in_channels, out_channels, 3, padding=1),
160
+ )
161
+
162
+ self.time_emb_proj = nn.Sequential(
163
+ nn.SiLU(),
164
+ nn.Linear(time_emb_dim, out_channels),
165
+ )
166
+
167
+ self.out_layers = nn.Sequential(
168
+ GroupNorm32(32, out_channels),
169
+ nn.SiLU(),
170
+ nn.Dropout(dropout),
171
+ nn.Conv3d(out_channels, out_channels, 3, padding=1),
172
+ )
173
+
174
+ if in_channels != out_channels:
175
+ self.skip_connection = nn.Conv3d(in_channels, out_channels, 1)
176
+ else:
177
+ self.skip_connection = nn.Identity()
178
+
179
+ def forward(
180
+ self,
181
+ x: torch.Tensor,
182
+ time_emb: torch.Tensor,
183
+ ) -> torch.Tensor:
184
+ h = self.in_layers(x)
185
+
186
+ # Add time embedding
187
+ time_emb = self.time_emb_proj(time_emb)
188
+ h = h + time_emb[:, :, None, None, None]
189
+
190
+ h = self.out_layers(h)
191
+
192
+ return self.skip_connection(x) + h
193
+
194
+
195
+ class SpatialAttention(nn.Module):
196
+ """Self-attention over spatial dimensions"""
197
+ def __init__(self, channels: int, num_heads: int = 8):
198
+ super().__init__()
199
+ self.num_heads = num_heads
200
+ self.head_dim = channels // num_heads
201
+
202
+ self.norm = GroupNorm32(32, channels)
203
+ self.qkv = nn.Conv1d(channels, channels * 3, 1)
204
+ self.proj = nn.Conv1d(channels, channels, 1)
205
+
206
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
207
+ b, c, t, h, w = x.shape
208
+
209
+ # Reshape to (B*T, C, H*W)
210
+ x_flat = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h * w)
211
+
212
+ # Normalize
213
+ x_norm = self.norm(x_flat.view(b * t, c, h, w)).view(b * t, c, h * w)
214
+
215
+ # QKV projection
216
+ qkv = self.qkv(x_norm)
217
+ q, k, v = qkv.chunk(3, dim=1)
218
+
219
+ # Reshape for multi-head attention
220
+ q = q.view(b * t, self.num_heads, self.head_dim, h * w).permute(0, 1, 3, 2)
221
+ k = k.view(b * t, self.num_heads, self.head_dim, h * w).permute(0, 1, 3, 2)
222
+ v = v.view(b * t, self.num_heads, self.head_dim, h * w).permute(0, 1, 3, 2)
223
+
224
+ # Attention
225
+ scale = self.head_dim ** -0.5
226
+ attn = torch.matmul(q, k.transpose(-2, -1)) * scale
227
+ attn = F.softmax(attn, dim=-1)
228
+
229
+ out = torch.matmul(attn, v)
230
+ out = out.permute(0, 1, 3, 2).reshape(b * t, c, h * w)
231
+
232
+ out = self.proj(out)
233
+ out = out.view(b, t, c, h, w).permute(0, 2, 1, 3, 4)
234
+
235
+ return x + out
236
+
237
+
238
+ class CrossAttention(nn.Module):
239
+ """Cross-attention for text conditioning"""
240
+ def __init__(
241
+ self,
242
+ query_dim: int,
243
+ context_dim: int,
244
+ num_heads: int = 8,
245
+ head_dim: int = 64,
246
+ ):
247
+ super().__init__()
248
+ self.num_heads = num_heads
249
+ self.head_dim = head_dim
250
+ inner_dim = head_dim * num_heads
251
+
252
+ self.norm = GroupNorm32(32, query_dim)
253
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
254
+ self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
255
+ self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
256
+ self.to_out = nn.Sequential(
257
+ nn.Linear(inner_dim, query_dim),
258
+ nn.Dropout(0.1),
259
+ )
260
+
261
+ def forward(
262
+ self,
263
+ x: torch.Tensor,
264
+ context: torch.Tensor,
265
+ ) -> torch.Tensor:
266
+ b, c, t, h, w = x.shape
267
+
268
+ # Reshape to (B, T*H*W, C)
269
+ x_flat = x.permute(0, 2, 3, 4, 1).reshape(b, t * h * w, c)
270
+
271
+ # Normalize
272
+ x_norm = self.norm(x.view(b, c, -1)).permute(0, 2, 1)
273
+
274
+ # QKV
275
+ q = self.to_q(x_norm)
276
+ k = self.to_k(context)
277
+ v = self.to_v(context)
278
+
279
+ # Reshape for multi-head
280
+ q = q.view(b, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
281
+ k = k.view(b, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
282
+ v = v.view(b, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
283
+
284
+ # Attention
285
+ scale = self.head_dim ** -0.5
286
+ attn = torch.matmul(q, k.transpose(-2, -1)) * scale
287
+ attn = F.softmax(attn, dim=-1)
288
+
289
+ out = torch.matmul(attn, v)
290
+ out = out.permute(0, 2, 1, 3).reshape(b, t * h * w, -1)
291
+ out = self.to_out(out)
292
+
293
+ out = out.view(b, t, h, w, c).permute(0, 4, 1, 2, 3)
294
+
295
+ return x + out
296
+
297
+
298
+ class TemporalAttention(nn.Module):
299
+ """Self-attention over temporal dimension"""
300
+ def __init__(self, channels: int, num_heads: int = 8):
301
+ super().__init__()
302
+ self.num_heads = num_heads
303
+ self.head_dim = channels // num_heads
304
+
305
+ self.norm = GroupNorm32(32, channels)
306
+ self.qkv = nn.Linear(channels, channels * 3)
307
+ self.proj = nn.Linear(channels, channels)
308
+
309
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
310
+ b, c, t, h, w = x.shape
311
+
312
+ # Reshape to (B*H*W, T, C)
313
+ x_flat = x.permute(0, 3, 4, 2, 1).reshape(b * h * w, t, c)
314
+
315
+ # Normalize
316
+ x_norm = self.norm(x.view(b, c, -1)).view(b, c, t, h, w)
317
+ x_norm = x_norm.permute(0, 3, 4, 2, 1).reshape(b * h * w, t, c)
318
+
319
+ # QKV
320
+ qkv = self.qkv(x_norm)
321
+ q, k, v = qkv.chunk(3, dim=-1)
322
+
323
+ # Reshape for multi-head
324
+ q = q.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
325
+ k = k.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
326
+ v = v.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
327
+
328
+ # Attention
329
+ scale = self.head_dim ** -0.5
330
+ attn = torch.matmul(q, k.transpose(-2, -1)) * scale
331
+ attn = F.softmax(attn, dim=-1)
332
+
333
+ out = torch.matmul(attn, v)
334
+ out = out.permute(0, 2, 1, 3).reshape(b * h * w, t, c)
335
+ out = self.proj(out)
336
+
337
+ out = out.view(b, h, w, t, c).permute(0, 4, 3, 1, 2)
338
+
339
+ return x + out
340
+
341
+
342
+ # ============================================================================
343
+ # Transformer Components (DiT-style)
344
+ # ============================================================================
345
+
346
+ class MultiHeadAttention(nn.Module):
347
+ """
348
+ Multi-head attention with optional flash attention and rotary embeddings.
349
+ Supports both self-attention and cross-attention.
350
+ """
351
+ def __init__(
352
+ self,
353
+ dim: int,
354
+ num_heads: int = 8,
355
+ qkv_bias: bool = True,
356
+ attn_drop: float = 0.0,
357
+ proj_drop: float = 0.0,
358
+ is_cross_attention: bool = False,
359
+ context_dim: Optional[int] = None,
360
+ ):
361
+ super().__init__()
362
+ self.num_heads = num_heads
363
+ self.head_dim = dim // num_heads
364
+ self.scale = self.head_dim ** -0.5
365
+ self.is_cross_attention = is_cross_attention
366
+
367
+ if is_cross_attention:
368
+ self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
369
+ self.to_kv = nn.Linear(context_dim or dim, dim * 2, bias=qkv_bias)
370
+ else:
371
+ self.to_qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
372
+
373
+ self.attn_drop = nn.Dropout(attn_drop)
374
+ self.proj = nn.Linear(dim, dim)
375
+ self.proj_drop = nn.Dropout(proj_drop)
376
+
377
+ def forward(
378
+ self,
379
+ x: torch.Tensor,
380
+ context: Optional[torch.Tensor] = None,
381
+ ) -> torch.Tensor:
382
+ B, N, C = x.shape
383
+
384
+ if self.is_cross_attention and context is not None:
385
+ q = self.to_q(x).reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
386
+ kv = self.to_kv(context).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
387
+ k, v = kv[0], kv[1]
388
+ else:
389
+ qkv = self.to_qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
390
+ q, k, v = qkv[0], qkv[1], qkv[2]
391
+
392
+ # Scaled dot-product attention
393
+ attn = (q @ k.transpose(-2, -1)) * self.scale
394
+ attn = attn.softmax(dim=-1)
395
+ attn = self.attn_drop(attn)
396
+
397
+ out = (attn @ v).transpose(1, 2).reshape(B, N, C)
398
+ out = self.proj(out)
399
+ out = self.proj_drop(out)
400
+
401
+ return out
402
+
403
+
404
+ class FeedForward(nn.Module):
405
+ """Feed-forward network with GELU activation"""
406
+ def __init__(
407
+ self,
408
+ dim: int,
409
+ hidden_dim: Optional[int] = None,
410
+ dropout: float = 0.0,
411
+ ):
412
+ super().__init__()
413
+ hidden_dim = hidden_dim or dim * 4
414
+ self.net = nn.Sequential(
415
+ nn.Linear(dim, hidden_dim),
416
+ nn.GELU(),
417
+ nn.Dropout(dropout),
418
+ nn.Linear(hidden_dim, dim),
419
+ nn.Dropout(dropout),
420
+ )
421
+
422
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
423
+ return self.net(x)
424
+
425
+
426
+ class DiTBlock(nn.Module):
427
+ """
428
+ Diffusion Transformer Block (DiT-style).
429
+ Uses adaptive layer norm for timestep conditioning.
430
+ """
431
+ def __init__(
432
+ self,
433
+ dim: int,
434
+ num_heads: int,
435
+ time_embed_dim: int,
436
+ mlp_ratio: float = 4.0,
437
+ dropout: float = 0.0,
438
+ context_dim: Optional[int] = None,
439
+ ):
440
+ super().__init__()
441
+
442
+ # Self-attention with adaptive norm
443
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=False)
444
+ self.attn = MultiHeadAttention(dim, num_heads, attn_drop=dropout, proj_drop=dropout)
445
+
446
+ # Cross-attention for text conditioning
447
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False)
448
+ self.cross_attn = MultiHeadAttention(
449
+ dim, num_heads,
450
+ attn_drop=dropout,
451
+ proj_drop=dropout,
452
+ is_cross_attention=True,
453
+ context_dim=context_dim,
454
+ )
455
+
456
+ # Feed-forward with adaptive norm
457
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=False)
458
+ self.ff = FeedForward(dim, int(dim * mlp_ratio), dropout)
459
+
460
+ # Adaptive parameters (DiT-style)
461
+ self.adaLN_modulation = nn.Sequential(
462
+ nn.SiLU(),
463
+ nn.Linear(time_embed_dim, dim * 9), # 3 params each for 3 blocks
464
+ )
465
+ nn.init.zeros_(self.adaLN_modulation[-1].weight)
466
+ nn.init.zeros_(self.adaLN_modulation[-1].bias)
467
+
468
+ def forward(
469
+ self,
470
+ x: torch.Tensor,
471
+ t_emb: torch.Tensor,
472
+ context: Optional[torch.Tensor] = None,
473
+ ) -> torch.Tensor:
474
+ # Get adaptive parameters
475
+ params = self.adaLN_modulation(t_emb)
476
+ (
477
+ scale1, shift1, gate1,
478
+ scale2, shift2, gate2,
479
+ scale3, shift3, gate3,
480
+ ) = params.unsqueeze(1).chunk(9, dim=-1)
481
+
482
+ # Self-attention
483
+ x_norm = self.norm1(x) * (1 + scale1) + shift1
484
+ x = x + gate1 * self.attn(x_norm)
485
+
486
+ # Cross-attention
487
+ if context is not None:
488
+ x_norm = self.norm2(x) * (1 + scale2) + shift2
489
+ x = x + gate2 * self.cross_attn(x_norm, context)
490
+
491
+ # Feed-forward
492
+ x_norm = self.norm3(x) * (1 + scale3) + shift3
493
+ x = x + gate3 * self.ff(x_norm)
494
+
495
+ return x
496
+
497
+
498
+ class TemporalTransformerBlock(nn.Module):
499
+ """
500
+ Transformer block specifically for temporal attention.
501
+ Processes video frames attending to other frames.
502
+ """
503
+ def __init__(
504
+ self,
505
+ dim: int,
506
+ num_heads: int,
507
+ time_embed_dim: int,
508
+ dropout: float = 0.0,
509
+ ):
510
+ super().__init__()
511
+
512
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False)
513
+ self.attn = MultiHeadAttention(dim, num_heads, attn_drop=dropout, proj_drop=dropout)
514
+
515
+ # Adaptive parameters
516
+ self.adaLN_modulation = nn.Sequential(
517
+ nn.SiLU(),
518
+ nn.Linear(time_embed_dim, dim * 3),
519
+ )
520
+ nn.init.zeros_(self.adaLN_modulation[-1].weight)
521
+ nn.init.zeros_(self.adaLN_modulation[-1].bias)
522
+
523
+ def forward(self, x: torch.Tensor, t_emb: torch.Tensor) -> torch.Tensor:
524
+ """
525
+ Args:
526
+ x: (B, T, C) temporal sequence
527
+ t_emb: (B, time_embed_dim) timestep embedding
528
+ """
529
+ params = self.adaLN_modulation(t_emb)
530
+ scale, shift, gate = params.unsqueeze(1).chunk(3, dim=-1)
531
+
532
+ x_norm = self.norm(x) * (1 + scale) + shift
533
+ x = x + gate * self.attn(x_norm)
534
+
535
+ return x
536
+
537
+
538
+ class SpatioTemporalTransformer(nn.Module):
539
+ """
540
+ Combined spatial and temporal transformer for video understanding.
541
+ First applies spatial attention within each frame, then temporal attention across frames.
542
+ """
543
+ def __init__(
544
+ self,
545
+ dim: int,
546
+ num_heads: int,
547
+ time_embed_dim: int,
548
+ context_dim: int,
549
+ depth: int = 2,
550
+ dropout: float = 0.0,
551
+ ):
552
+ super().__init__()
553
+
554
+ self.spatial_blocks = nn.ModuleList([
555
+ DiTBlock(dim, num_heads, time_embed_dim, dropout=dropout, context_dim=context_dim)
556
+ for _ in range(depth)
557
+ ])
558
+
559
+ self.temporal_blocks = nn.ModuleList([
560
+ TemporalTransformerBlock(dim, num_heads, time_embed_dim, dropout)
561
+ for _ in range(depth)
562
+ ])
563
+
564
+ def forward(
565
+ self,
566
+ x: torch.Tensor, # (B, C, T, H, W)
567
+ t_emb: torch.Tensor, # (B, time_embed_dim)
568
+ context: torch.Tensor, # (B, seq_len, context_dim)
569
+ ) -> torch.Tensor:
570
+ B, C, T, H, W = x.shape
571
+
572
+ # Spatial attention: process each frame
573
+ # Reshape to (B*T, H*W, C)
574
+ x_spatial = rearrange(x, 'b c t h w -> (b t) (h w) c')
575
+ t_emb_spatial = repeat(t_emb, 'b d -> (b t) d', t=T)
576
+ context_spatial = repeat(context, 'b n d -> (b t) n d', t=T)
577
+
578
+ for block in self.spatial_blocks:
579
+ x_spatial = block(x_spatial, t_emb_spatial, context_spatial)
580
+
581
+ # Reshape back: (B, T, H*W, C)
582
+ x_spatial = rearrange(x_spatial, '(b t) n c -> b t n c', b=B, t=T)
583
+
584
+ # Temporal attention: process each spatial location
585
+ # Reshape to (B*H*W, T, C)
586
+ x_temporal = rearrange(x_spatial, 'b t n c -> (b n) t c', n=H*W)
587
+ t_emb_temporal = repeat(t_emb, 'b d -> (b n) d', n=H*W)
588
+
589
+ for block in self.temporal_blocks:
590
+ x_temporal = block(x_temporal, t_emb_temporal)
591
+
592
+ # Reshape back to (B, C, T, H, W)
593
+ x_out = rearrange(x_temporal, '(b h w) t c -> b c t h w', b=B, h=H, w=W)
594
+
595
+ return x_out
596
+
597
+
598
+ class TransformerBlock3D(nn.Module):
599
+ """
600
+ Enhanced Transformer block with spatial, temporal, and cross attention.
601
+ Uses DiT-style adaptive layer norm for better timestep conditioning.
602
+ """
603
+ def __init__(
604
+ self,
605
+ channels: int,
606
+ context_dim: int,
607
+ time_embed_dim: int,
608
+ num_heads: int = 8,
609
+ transformer_depth: int = 1,
610
+ use_spatio_temporal: bool = True,
611
+ ):
612
+ super().__init__()
613
+
614
+ self.use_spatio_temporal = use_spatio_temporal
615
+
616
+ if use_spatio_temporal:
617
+ # Use the new SpatioTemporalTransformer
618
+ self.transformer = SpatioTemporalTransformer(
619
+ dim=channels,
620
+ num_heads=num_heads,
621
+ time_embed_dim=time_embed_dim,
622
+ context_dim=context_dim,
623
+ depth=transformer_depth,
624
+ )
625
+ else:
626
+ # Fallback to simpler attention
627
+ self.spatial_attn = SpatialAttention(channels, num_heads)
628
+ self.temporal_attn = TemporalAttention(channels, num_heads)
629
+ self.cross_attn = CrossAttention(
630
+ query_dim=channels,
631
+ context_dim=context_dim,
632
+ num_heads=num_heads,
633
+ )
634
+
635
+ # Feed-forward (used in both cases)
636
+ self.ff = nn.Sequential(
637
+ GroupNorm32(32, channels),
638
+ nn.Conv3d(channels, channels * 4, 1),
639
+ nn.GELU(),
640
+ nn.Conv3d(channels * 4, channels, 1),
641
+ )
642
+
643
+ def forward(
644
+ self,
645
+ x: torch.Tensor,
646
+ context: torch.Tensor,
647
+ t_emb: Optional[torch.Tensor] = None,
648
+ ) -> torch.Tensor:
649
+ if self.use_spatio_temporal and t_emb is not None:
650
+ x = self.transformer(x, t_emb, context)
651
+ else:
652
+ x = self.spatial_attn(x)
653
+ x = self.temporal_attn(x)
654
+ x = self.cross_attn(x, context)
655
+
656
+ x = x + self.ff(x)
657
+ return x
658
+
659
+
660
+ class TemporalAttention(nn.Module):
661
+ """Self-attention over temporal dimension (legacy, for backward compatibility)"""
662
+ def __init__(self, channels: int, num_heads: int = 8):
663
+ super().__init__()
664
+ self.num_heads = num_heads
665
+ self.head_dim = channels // num_heads
666
+
667
+ self.norm = GroupNorm32(32, channels)
668
+ self.qkv = nn.Linear(channels, channels * 3)
669
+ self.proj = nn.Linear(channels, channels)
670
+
671
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
672
+ b, c, t, h, w = x.shape
673
+
674
+ # Reshape to (B*H*W, T, C)
675
+ x_flat = x.permute(0, 3, 4, 2, 1).reshape(b * h * w, t, c)
676
+
677
+ # Normalize
678
+ x_norm = self.norm(x.view(b, c, -1)).view(b, c, t, h, w)
679
+ x_norm = x_norm.permute(0, 3, 4, 2, 1).reshape(b * h * w, t, c)
680
+
681
+ # QKV
682
+ qkv = self.qkv(x_norm)
683
+ q, k, v = qkv.chunk(3, dim=-1)
684
+
685
+ # Reshape for multi-head
686
+ q = q.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
687
+ k = k.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
688
+ v = v.view(b * h * w, t, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
689
+
690
+ # Attention
691
+ scale = self.head_dim ** -0.5
692
+ attn = torch.matmul(q, k.transpose(-2, -1)) * scale
693
+ attn = F.softmax(attn, dim=-1)
694
+
695
+ out = torch.matmul(attn, v)
696
+ out = out.permute(0, 2, 1, 3).reshape(b * h * w, t, c)
697
+ out = self.proj(out)
698
+
699
+ out = out.view(b, h, w, t, c).permute(0, 4, 3, 1, 2)
700
+
701
+ return x + out
702
+
703
+
704
+ class UNet3D(nn.Module):
705
+ """
706
+ 3D UNet for video diffusion with text conditioning.
707
+ Enhanced with DiT-style transformer blocks for better temporal modeling.
708
+ """
709
+ def __init__(
710
+ self,
711
+ in_channels: int = 3,
712
+ model_channels: int = 128,
713
+ out_channels: int = 3,
714
+ num_res_blocks: int = 2,
715
+ attention_resolutions: Tuple[int, ...] = (8, 16),
716
+ channel_mult: Tuple[int, ...] = (1, 2, 4, 8),
717
+ num_heads: int = 8,
718
+ context_dim: int = 512,
719
+ dropout: float = 0.1,
720
+ use_transformer: bool = True, # Use enhanced transformer blocks
721
+ transformer_depth: int = 1, # Depth of transformer blocks
722
+ use_gradient_checkpointing: bool = False, # Enable gradient checkpointing for memory
723
+ ):
724
+ super().__init__()
725
+
726
+ self.in_channels = in_channels
727
+ self.model_channels = model_channels
728
+ self.out_channels = out_channels
729
+ self.num_res_blocks = num_res_blocks
730
+ self.attention_resolutions = attention_resolutions
731
+ self.channel_mult = channel_mult
732
+ self.num_heads = num_heads
733
+ self.use_transformer = use_transformer
734
+ self.use_gradient_checkpointing = use_gradient_checkpointing
735
+
736
+ time_embed_dim = model_channels * 4
737
+ self.time_embed_dim = time_embed_dim
738
+
739
+ # Time embedding
740
+ self.time_embed = nn.Sequential(
741
+ nn.Linear(model_channels, time_embed_dim),
742
+ nn.SiLU(),
743
+ nn.Linear(time_embed_dim, time_embed_dim),
744
+ )
745
+
746
+ # Input convolution
747
+ self.input_blocks = nn.ModuleList([
748
+ nn.Conv3d(in_channels, model_channels, 3, padding=1)
749
+ ])
750
+
751
+ # Downsampling
752
+ ch = model_channels
753
+ input_block_chans = [ch]
754
+ ds = 1
755
+
756
+ for level, mult in enumerate(channel_mult):
757
+ for _ in range(num_res_blocks):
758
+ layers = [
759
+ ResBlock3D(ch, mult * model_channels, time_embed_dim, dropout)
760
+ ]
761
+ ch = mult * model_channels
762
+
763
+ if ds in attention_resolutions:
764
+ layers.append(
765
+ TransformerBlock3D(
766
+ channels=ch,
767
+ context_dim=context_dim,
768
+ time_embed_dim=time_embed_dim,
769
+ num_heads=num_heads,
770
+ transformer_depth=transformer_depth,
771
+ use_spatio_temporal=use_transformer,
772
+ )
773
+ )
774
+
775
+ self.input_blocks.append(nn.ModuleList(layers))
776
+ input_block_chans.append(ch)
777
+
778
+ if level != len(channel_mult) - 1:
779
+ self.input_blocks.append(nn.ModuleList([Downsample3D(ch)]))
780
+ input_block_chans.append(ch)
781
+ ds *= 2
782
+
783
+ # Middle
784
+ self.middle_block = nn.ModuleList([
785
+ ResBlock3D(ch, ch, time_embed_dim, dropout),
786
+ TransformerBlock3D(
787
+ channels=ch,
788
+ context_dim=context_dim,
789
+ time_embed_dim=time_embed_dim,
790
+ num_heads=num_heads,
791
+ transformer_depth=transformer_depth,
792
+ use_spatio_temporal=use_transformer,
793
+ ),
794
+ ResBlock3D(ch, ch, time_embed_dim, dropout),
795
+ ])
796
+
797
+ # Upsampling
798
+ self.output_blocks = nn.ModuleList([])
799
+
800
+ for level, mult in list(enumerate(channel_mult))[::-1]:
801
+ for i in range(num_res_blocks + 1):
802
+ ich = input_block_chans.pop()
803
+ layers = [
804
+ ResBlock3D(ch + ich, mult * model_channels, time_embed_dim, dropout)
805
+ ]
806
+ ch = mult * model_channels
807
+
808
+ if ds in attention_resolutions:
809
+ layers.append(
810
+ TransformerBlock3D(
811
+ channels=ch,
812
+ context_dim=context_dim,
813
+ time_embed_dim=time_embed_dim,
814
+ num_heads=num_heads,
815
+ transformer_depth=transformer_depth,
816
+ use_spatio_temporal=use_transformer,
817
+ )
818
+ )
819
+
820
+ if level and i == num_res_blocks:
821
+ layers.append(Upsample3D(ch))
822
+ ds //= 2
823
+
824
+ self.output_blocks.append(nn.ModuleList(layers))
825
+
826
+ # Output
827
+ self.out = nn.Sequential(
828
+ GroupNorm32(32, ch),
829
+ nn.SiLU(),
830
+ nn.Conv3d(ch, out_channels, 3, padding=1),
831
+ )
832
+
833
+ def _checkpoint_forward(self, layer, h, t_emb, context=None):
834
+ """Helper for gradient checkpointing"""
835
+ if isinstance(layer, ResBlock3D):
836
+ return layer(h, t_emb)
837
+ elif isinstance(layer, TransformerBlock3D):
838
+ return layer(h, context, t_emb)
839
+ elif isinstance(layer, (Downsample3D, Upsample3D)):
840
+ return layer(h)
841
+ return h
842
+
843
+ def forward(
844
+ self,
845
+ x: torch.Tensor, # (B, C, T, H, W)
846
+ timesteps: torch.Tensor, # (B,)
847
+ context: torch.Tensor, # (B, seq_len, context_dim)
848
+ ) -> torch.Tensor:
849
+ """
850
+ Forward pass
851
+ Args:
852
+ x: Noisy video tensor (B, C, T, H, W)
853
+ timesteps: Diffusion timesteps (B,)
854
+ context: Text embeddings (B, seq_len, context_dim)
855
+ Returns:
856
+ Predicted noise (B, C, T, H, W)
857
+ """
858
+ from torch.utils.checkpoint import checkpoint
859
+
860
+ # Time embedding
861
+ t_emb = get_timestep_embedding(timesteps, self.model_channels)
862
+ t_emb = self.time_embed(t_emb)
863
+
864
+ # Downsampling path
865
+ hs = []
866
+ h = x
867
+
868
+ for module in self.input_blocks:
869
+ if isinstance(module, nn.Conv3d):
870
+ h = module(h)
871
+ elif isinstance(module, nn.ModuleList):
872
+ for layer in module:
873
+ if self.use_gradient_checkpointing and self.training:
874
+ h = checkpoint(self._checkpoint_forward, layer, h, t_emb, context, use_reentrant=False)
875
+ else:
876
+ h = self._checkpoint_forward(layer, h, t_emb, context)
877
+ hs.append(h)
878
+
879
+ # Middle
880
+ for layer in self.middle_block:
881
+ if self.use_gradient_checkpointing and self.training:
882
+ h = checkpoint(self._checkpoint_forward, layer, h, t_emb, context, use_reentrant=False)
883
+ else:
884
+ h = self._checkpoint_forward(layer, h, t_emb, context)
885
+
886
+ # Upsampling path
887
+ for module in self.output_blocks:
888
+ h = torch.cat([h, hs.pop()], dim=1)
889
+ for layer in module:
890
+ if self.use_gradient_checkpointing and self.training:
891
+ h = checkpoint(self._checkpoint_forward, layer, h, t_emb, context, use_reentrant=False)
892
+ else:
893
+ h = self._checkpoint_forward(layer, h, t_emb, context)
894
+
895
+ return self.out(h)
896
+
897
+
898
+ def create_unet(config) -> UNet3D:
899
+ """Create UNet model from config"""
900
+ return UNet3D(
901
+ in_channels=config.in_channels,
902
+ model_channels=config.model_channels,
903
+ out_channels=config.in_channels,
904
+ num_res_blocks=config.num_res_blocks,
905
+ attention_resolutions=config.attention_resolutions,
906
+ channel_mult=config.channel_mult,
907
+ num_heads=config.num_heads,
908
+ context_dim=config.context_dim,
909
+ use_transformer=getattr(config, 'use_transformer', True),
910
+ transformer_depth=getattr(config, 'transformer_depth', 1),
911
+ use_gradient_checkpointing=getattr(config, 'use_gradient_checkpointing', False),
912
+ )
913
+
914
+
915
+ if __name__ == "__main__":
916
+ # Test the enhanced model with transformer blocks
917
+ print("Testing UNet3D with DiT-style Transformer blocks...")
918
+
919
+ model = UNet3D(
920
+ in_channels=3,
921
+ model_channels=64,
922
+ channel_mult=(1, 2, 4),
923
+ attention_resolutions=(8, 16),
924
+ num_heads=4,
925
+ context_dim=256,
926
+ use_transformer=True,
927
+ transformer_depth=1,
928
+ )
929
+
930
+ # Test input
931
+ batch_size = 2
932
+ x = torch.randn(batch_size, 3, 16, 64, 64) # (B, C, T, H, W)
933
+ t = torch.randint(0, 1000, (batch_size,))
934
+ context = torch.randn(batch_size, 77, 256) # (B, seq_len, context_dim)
935
+
936
+ # Forward pass
937
+ out = model(x, t, context)
938
+ print(f"Input shape: {x.shape}")
939
+ print(f"Output shape: {out.shape}")
940
+ print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
941
+
942
+ # Test backward pass
943
+ loss = out.sum()
944
+ loss.backward()
945
+ print("Backward pass successful!")
946
+
947
+ # Test without transformer (legacy mode)
948
+ print("\nTesting UNet3D without transformer (legacy mode)...")
949
+ model_legacy = UNet3D(
950
+ in_channels=3,
951
+ model_channels=64,
952
+ channel_mult=(1, 2, 4),
953
+ attention_resolutions=(8, 16),
954
+ num_heads=4,
955
+ context_dim=256,
956
+ use_transformer=False,
957
+ )
958
+
959
+ out_legacy = model_legacy(x, t, context)
960
+ print(f"Legacy output shape: {out_legacy.shape}")
961
+ print(f"Legacy parameters: {sum(p.numel() for p in model_legacy.parameters()):,}")
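Training-step sketch (not part of the commit): shapes follow ModelConfig defaults (96 base channels, 16 frames at 64x64, context_dim 384). The noisy video and the target below are random placeholders; in training they would come from the DDIM scheduler's forward process, with the loss pairing the predicted epsilon against the actual added noise.

import torch
import torch.nn.functional as F
from config import ModelConfig
from models.unet3d import create_unet

cfg = ModelConfig()
unet = create_unet(cfg)

noisy_video = torch.randn(1, 3, cfg.num_frames, cfg.image_size, cfg.image_size)  # (B, C, T, H, W)
timesteps = torch.randint(0, 100, (1,))                          # DDIMConfig.num_train_timesteps = 100
context = torch.randn(1, cfg.max_text_length, cfg.context_dim)   # stand-in for text embeddings

pred_noise = unet(noisy_video, timesteps, context)               # same shape as noisy_video
loss = F.mse_loss(pred_noise, torch.randn_like(pred_noise))      # epsilon-prediction MSE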
pipeline.py ADDED
@@ -0,0 +1,416 @@
1
+ """
2
+ Pipeline for text-to-sign language GIF generation
3
+ End-to-end inference with a trained model
4
+ """
5
+
6
+ import os
7
+ from typing import List, Optional, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import numpy as np
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+
15
+ from config import ModelConfig, DDIMConfig, GenerationConfig
16
+ from models import UNet3D, TextEncoder, create_text_encoder
17
+ from schedulers import DDIMScheduler
18
+ from dataset import SimpleTokenizer
19
+
20
+
21
+ class Text2SignPipeline:
22
+ """
23
+ End-to-end pipeline for text-to-sign language GIF generation
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ model: UNet3D,
29
+ text_encoder: TextEncoder,
30
+ scheduler: DDIMScheduler,
31
+ model_config: ModelConfig,
32
+ generation_config: GenerationConfig,
33
+ device: Union[str, torch.device] = "cuda",
34
+ ):
35
+ self.model = model.to(device)
36
+ self.text_encoder = text_encoder.to(device)
37
+ self.scheduler = scheduler
38
+ self.model_config = model_config
39
+ self.generation_config = generation_config
40
+ self.device = device
41
+ self.use_clip_text_encoder = getattr(model_config, "use_clip_text_encoder", False) or getattr(text_encoder, "use_clip", False)
42
+
43
+ # Move scheduler tensors to device
44
+ self._move_scheduler_to_device()
45
+
46
+ # Tokenizer
47
+ self.tokenizer = None if self.use_clip_text_encoder else SimpleTokenizer(
48
+ vocab_size=model_config.vocab_size,
49
+ max_length=model_config.max_text_length,
50
+ )
51
+
52
+ # Set models to eval mode
53
+ self.model.eval()
54
+ self.text_encoder.eval()
55
+
56
+ def _move_scheduler_to_device(self):
57
+ """Move scheduler tensors to device"""
58
+ self.scheduler.betas = self.scheduler.betas.to(self.device)
59
+ self.scheduler.alphas = self.scheduler.alphas.to(self.device)
60
+ self.scheduler.alphas_cumprod = self.scheduler.alphas_cumprod.to(self.device)
61
+ self.scheduler.alphas_cumprod_prev = self.scheduler.alphas_cumprod_prev.to(self.device)
62
+ self.scheduler.sqrt_alphas_cumprod = self.scheduler.sqrt_alphas_cumprod.to(self.device)
63
+ self.scheduler.sqrt_one_minus_alphas_cumprod = self.scheduler.sqrt_one_minus_alphas_cumprod.to(self.device)
64
+
65
+ @classmethod
66
+ def from_pretrained(
67
+ cls,
68
+ checkpoint_path: str,
69
+ device: Union[str, torch.device] = "cuda",
70
+ ) -> "Text2SignPipeline":
71
+ """
72
+ Load pipeline from a saved checkpoint
73
+
74
+ Args:
75
+ checkpoint_path: Path to the checkpoint file
76
+ device: Device to load models on
77
+
78
+ Returns:
79
+ Text2SignPipeline instance
80
+ """
81
+ checkpoint = torch.load(checkpoint_path, map_location=device)
82
+
83
+ # Get configs from checkpoint
84
+ model_config = checkpoint.get("model_config", ModelConfig())
85
+ ddim_config = checkpoint.get("ddim_config", DDIMConfig())
86
+ generation_config = GenerationConfig()
87
+
88
+ # Handle dataclass or dict
89
+ if isinstance(model_config, dict):
90
+ model_config = ModelConfig(**model_config)
91
+ if isinstance(ddim_config, dict):
92
+ ddim_config = DDIMConfig(**ddim_config)
93
+
94
+ # Detect actual transformer_depth from model weights (config may be wrong)
95
+ state_dict = checkpoint["model_state_dict"]
96
+ actual_transformer_depth = 1
97
+ for key in state_dict.keys():
98
+ if 'spatial_blocks.' in key:
99
+ idx = int(key.split('spatial_blocks.')[1].split('.')[0])
100
+ actual_transformer_depth = max(actual_transformer_depth, idx + 1)
101
+
102
+ config_depth = getattr(model_config, 'transformer_depth', 1)
103
+ if config_depth != actual_transformer_depth:
104
+ print(f" Note: Config says transformer_depth={config_depth}, but weights have depth={actual_transformer_depth}")
105
+ print(f" Using actual depth from weights: {actual_transformer_depth}")
106
+
107
+ # Create models with all transformer parameters from config
108
+ model = UNet3D(
109
+ in_channels=model_config.in_channels,
110
+ model_channels=model_config.model_channels,
111
+ out_channels=model_config.in_channels,
112
+ num_res_blocks=model_config.num_res_blocks,
113
+ attention_resolutions=model_config.attention_resolutions,
114
+ channel_mult=model_config.channel_mult,
115
+ num_heads=model_config.num_heads,
116
+ context_dim=model_config.context_dim,
117
+ use_transformer=getattr(model_config, 'use_transformer', True),
118
+ transformer_depth=actual_transformer_depth, # Use detected depth from weights
119
+ use_gradient_checkpointing=getattr(model_config, 'use_gradient_checkpointing', False),
120
+ )
121
+
122
+ # Detect text encoder type from weights
123
+ text_encoder_state_dict = checkpoint["text_encoder_state_dict"]
124
+ use_clip = getattr(model_config, "use_clip_text_encoder", False)
125
+
126
+ # Check if weights match CLIP structure
127
+ has_clip_keys = any("model.text_model" in k for k in text_encoder_state_dict.keys())
128
+ has_custom_keys = any("token_embedding.weight" in k and "model.text_model" not in k for k in text_encoder_state_dict.keys())
129
+
130
+ if use_clip and not has_clip_keys and has_custom_keys:
131
+ print(" Note: Config says use_clip_text_encoder=True, but weights appear to be custom TextEncoder")
132
+ print(" Forcing use_clip=False")
133
+ use_clip = False
134
+ # Update config to match
135
+ model_config.use_clip_text_encoder = False
136
+
137
+ text_encoder = create_text_encoder(
138
+ model_config,
139
+ use_clip=use_clip,
140
+ )
141
+
142
+ scheduler = DDIMScheduler(
143
+ num_train_timesteps=ddim_config.num_train_timesteps,
144
+ beta_start=ddim_config.beta_start,
145
+ beta_end=ddim_config.beta_end,
146
+ beta_schedule=ddim_config.beta_schedule,
147
+ clip_sample=ddim_config.clip_sample,
148
+ prediction_type=ddim_config.prediction_type,
149
+ )
150
+
151
+ # Load weights
152
+ model.load_state_dict(checkpoint["model_state_dict"])
153
+ text_encoder.load_state_dict(checkpoint["text_encoder_state_dict"])
154
+
155
+ return cls(
156
+ model=model,
157
+ text_encoder=text_encoder,
158
+ scheduler=scheduler,
159
+ model_config=model_config,
160
+ generation_config=generation_config,
161
+ device=device,
162
+ )
163
+
164
+ @torch.no_grad()
165
+ def __call__(
166
+ self,
167
+ prompt: Union[str, List[str]],
168
+ num_inference_steps: Optional[int] = None,
169
+ guidance_scale: Optional[float] = None,
170
+ eta: Optional[float] = None,
171
+ generator: Optional[torch.Generator] = None,
172
+ output_type: str = "pil", # "pil", "tensor", "numpy"
173
+ ) -> Union[List[List[Image.Image]], torch.Tensor, np.ndarray]:
174
+ """
175
+ Generate sign language video from text prompt
176
+
177
+ Args:
178
+ prompt: Text prompt or list of prompts
179
+ num_inference_steps: Number of denoising steps
180
+ guidance_scale: Classifier-free guidance scale
181
+ eta: Stochasticity parameter (0 = deterministic DDIM)
182
+ generator: Random generator for reproducibility
183
+ output_type: Type of output ("pil", "tensor", "numpy")
184
+
185
+ Returns:
186
+ Generated videos in requested format
187
+ """
188
+ # Handle single prompt
189
+ if isinstance(prompt, str):
190
+ prompt = [prompt]
191
+
192
+ batch_size = len(prompt)
193
+
194
+ # Use default values if not specified
195
+ if num_inference_steps is None:
196
+ num_inference_steps = self.generation_config.num_inference_steps
197
+ if guidance_scale is None:
198
+ guidance_scale = self.generation_config.guidance_scale
199
+ if eta is None:
200
+ eta = self.generation_config.eta
201
+
202
+ # Tokenize prompts
203
+ if self.use_clip_text_encoder:
204
+ text_embeddings = self.text_encoder(tokens=None, text=prompt)
205
+ else:
206
+ tokens = self.tokenizer(prompt).to(self.device)
207
+ text_embeddings = self.text_encoder(tokens)
208
+
209
+ # For classifier-free guidance
210
+ if guidance_scale > 1.0:
211
+ if self.use_clip_text_encoder:
212
+ uncond_embeddings = self.text_encoder(tokens=None, text=[""] * batch_size)
213
+ else:
214
+ uncond_tokens = self.tokenizer([""] * batch_size).to(self.device)
215
+ uncond_embeddings = self.text_encoder(uncond_tokens)
216
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
217
+
218
+ # Set inference timesteps
219
+ self.scheduler.set_timesteps(num_inference_steps, device=self.device)
220
+
221
+ # Initialize latents
222
+ latents_shape = (
223
+ batch_size,
224
+ self.model_config.in_channels,
225
+ self.model_config.num_frames,
226
+ self.model_config.image_size,
227
+ self.model_config.image_size,
228
+ )
229
+
230
+ if generator is not None:
231
+ latents = torch.randn(latents_shape, generator=generator, device=self.device)
232
+ else:
233
+ latents = torch.randn(latents_shape, device=self.device)
234
+
235
+ # Denoising loop
236
+ for t in tqdm(self.scheduler.timesteps, desc="Generating sign language", leave=True):
237
+ latent_model_input = latents
238
+
239
+ if guidance_scale > 1.0:
240
+ latent_model_input = torch.cat([latents] * 2)
241
+
242
+ timestep = torch.tensor([t] * latent_model_input.shape[0], device=self.device)
243
+
244
+ # Predict noise
245
+ noise_pred = self.model(latent_model_input, timestep, text_embeddings)
246
+
247
+ # Apply classifier-free guidance
248
+ if guidance_scale > 1.0:
249
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
250
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
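+ # classifier-free guidance: eps = eps_uncond + s * (eps_text - eps_uncond); s = 1 recovers the purely conditional prediction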
251
+
252
+ # DDIM step
253
+ latents, _ = self.scheduler.step(noise_pred, t, latents, eta=eta, generator=generator)
254
+
255
+ # Denormalize
256
+ videos = (latents + 1) / 2
257
+ videos = videos.clamp(0, 1)
258
+
259
+ # Convert to output type
260
+ if output_type == "tensor":
261
+ return videos
262
+ elif output_type == "numpy":
263
+ return videos.cpu().numpy()
264
+ else: # "pil"
265
+ return self._tensor_to_pil(videos)
266
+
267
+ def _tensor_to_pil(self, videos: torch.Tensor) -> List[List[Image.Image]]:
268
+ """Convert tensor videos to PIL images"""
269
+ # videos: (B, C, T, H, W)
270
+ videos = videos.cpu().numpy()
271
+
272
+ all_videos = []
273
+ for video in videos:
274
+ # (C, T, H, W) -> (T, H, W, C)
275
+ frames = video.transpose(1, 2, 3, 0)
276
+ frames = (frames * 255).astype(np.uint8)
277
+
278
+ pil_frames = [Image.fromarray(frame) for frame in frames]
279
+ all_videos.append(pil_frames)
280
+
281
+ return all_videos
282
+
283
+ def save_gif(
284
+ self,
285
+ frames: List[Image.Image],
286
+ path: str,
287
+ fps: Optional[int] = None,
288
+ ):
289
+ """
290
+ Save frames as GIF
291
+
292
+ Args:
293
+ frames: List of PIL images
294
+ path: Output path
295
+ fps: Frames per second
296
+ """
297
+ if fps is None:
298
+ fps = self.generation_config.fps
299
+
300
+ duration = 1000 // fps
301
+
302
+ frames[0].save(
303
+ path,
304
+ save_all=True,
305
+ append_images=frames[1:],
306
+ duration=duration,
307
+ loop=0,
308
+ )
309
+
310
+ def generate_and_save(
311
+ self,
312
+ prompt: Union[str, List[str]],
313
+ output_dir: str,
314
+ prefix: str = "sign",
315
+ **kwargs,
316
+ ) -> List[str]:
317
+ """
318
+ Generate and save GIFs
319
+
320
+ Args:
321
+ prompt: Text prompt(s)
322
+ output_dir: Directory to save GIFs
323
+ prefix: Filename prefix
324
+ **kwargs: Arguments passed to __call__
325
+
326
+ Returns:
327
+ List of saved file paths
328
+ """
329
+ os.makedirs(output_dir, exist_ok=True)
330
+
331
+ if isinstance(prompt, str):
332
+ prompt = [prompt]
333
+
334
+ videos = self(prompt, **kwargs)
335
+
336
+ saved_paths = []
337
+ for i, (frames, text) in enumerate(zip(videos, prompt)):
338
+ # Create filename from prompt
339
+ safe_text = "".join(c if c.isalnum() else "_" for c in text[:30])
340
+ filename = f"{prefix}_{i}_{safe_text}.gif"
341
+ filepath = os.path.join(output_dir, filename)
342
+
343
+ self.save_gif(frames, filepath)
344
+ saved_paths.append(filepath)
345
+ print(f"Saved: {filepath}")
346
+
347
+ return saved_paths
348
+
349
+
350
+ def create_pipeline(
351
+ model_config: Optional[ModelConfig] = None,
352
+ ddim_config: Optional[DDIMConfig] = None,
353
+ generation_config: Optional[GenerationConfig] = None,
354
+ device: str = "cuda",
355
+ ) -> Text2SignPipeline:
356
+ """
357
+ Create a new pipeline with untrained models
358
+ (useful for testing)
359
+ """
360
+ if model_config is None:
361
+ model_config = ModelConfig()
362
+ if ddim_config is None:
363
+ ddim_config = DDIMConfig()
364
+ if generation_config is None:
365
+ generation_config = GenerationConfig()
366
+
367
+ model = UNet3D(
368
+ in_channels=model_config.in_channels,
369
+ model_channels=model_config.model_channels,
370
+ out_channels=model_config.in_channels,
371
+ num_res_blocks=model_config.num_res_blocks,
372
+ attention_resolutions=model_config.attention_resolutions,
373
+ channel_mult=model_config.channel_mult,
374
+ num_heads=model_config.num_heads,
375
+ context_dim=model_config.context_dim,
376
+ )
377
+
378
+ text_encoder = create_text_encoder(
379
+ model_config,
380
+ use_clip=getattr(model_config, "use_clip_text_encoder", False),
381
+ )
382
+
383
+ scheduler = DDIMScheduler(
384
+ num_train_timesteps=ddim_config.num_train_timesteps,
385
+ beta_start=ddim_config.beta_start,
386
+ beta_end=ddim_config.beta_end,
387
+ beta_schedule=ddim_config.beta_schedule,
388
+ clip_sample=ddim_config.clip_sample,
389
+ prediction_type=ddim_config.prediction_type,
390
+ )
391
+
392
+ return Text2SignPipeline(
393
+ model=model,
394
+ text_encoder=text_encoder,
395
+ scheduler=scheduler,
396
+ model_config=model_config,
397
+ generation_config=generation_config,
398
+ device=device,
399
+ )
400
+
401
+
402
+ if __name__ == "__main__":
403
+ # Test pipeline
404
+ print("Creating pipeline...")
405
+ pipeline = create_pipeline(device="cpu")
406
+
407
+ print("Testing generation...")
408
+ videos = pipeline(
409
+ ["Hello", "Thank you"],
410
+ num_inference_steps=5,
411
+ guidance_scale=3.0,
412
+ )
413
+
414
+ print(f"Generated {len(videos)} videos")
415
+ print(f"Each video has {len(videos[0])} frames")
416
+ print(f"Frame size: {videos[0][0].size}")
schedulers/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Schedulers package for text-to-sign language generation
3
+ """
4
+
5
+ from .ddim import DDIMScheduler, get_ddim_scheduler
6
+
7
+ __all__ = [
8
+ "DDIMScheduler",
9
+ "get_ddim_scheduler",
10
+ ]
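A short sketch of how the package is wired to the project config (get_ddim_scheduler is defined in schedulers/ddim.py below):

from config import DDIMConfig
from schedulers import get_ddim_scheduler

cfg = DDIMConfig()
scheduler = get_ddim_scheduler(cfg)
scheduler.set_timesteps(cfg.num_inference_steps)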
schedulers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (342 Bytes)
 
schedulers/__pycache__/ddim.cpython-310.pyc ADDED
Binary file (7.92 kB)
 
schedulers/ddim.py ADDED
@@ -0,0 +1,298 @@
1
+ """
2
+ DDIM (Denoising Diffusion Implicit Models) Scheduler
3
+ Implements both training and sampling procedures
4
+ """
5
+
6
+ import math
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import numpy as np
12
+
13
+
14
+ class DDIMScheduler:
15
+ """
16
+ DDIM Scheduler for diffusion models
17
+
18
+ Supports both DDPM training and DDIM deterministic/stochastic sampling
19
+ """
20
+
21
+ def __init__(
22
+ self,
23
+ num_train_timesteps: int = 1000,
24
+ beta_start: float = 0.0001,
25
+ beta_end: float = 0.02,
26
+ beta_schedule: str = "linear",
27
+ clip_sample: bool = True,
28
+ prediction_type: str = "epsilon",
29
+ thresholding: bool = False,
30
+ dynamic_thresholding_ratio: float = 0.995,
31
+ sample_max_value: float = 1.0,
32
+ ):
33
+ """
34
+ Args:
35
+ num_train_timesteps: Number of diffusion steps
36
+ beta_start: Starting beta value
37
+ beta_end: Ending beta value
38
+ beta_schedule: Type of beta schedule ("linear" or "cosine")
39
+ clip_sample: Whether to clip predicted samples
40
+ prediction_type: What the model predicts ("epsilon" or "v_prediction")
41
+ thresholding: Whether to use dynamic thresholding
42
+ dynamic_thresholding_ratio: Ratio for dynamic thresholding
43
+ sample_max_value: Max value for clipping
44
+ """
45
+ self.num_train_timesteps = num_train_timesteps
46
+ self.beta_start = beta_start
47
+ self.beta_end = beta_end
48
+ self.beta_schedule = beta_schedule
49
+ self.clip_sample = clip_sample
50
+ self.prediction_type = prediction_type
51
+ self.thresholding = thresholding
52
+ self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
53
+ self.sample_max_value = sample_max_value
54
+
55
+ # Compute betas
56
+ if beta_schedule == "linear":
57
+ self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps)
58
+ elif beta_schedule == "cosine":
59
+ self.betas = self._cosine_beta_schedule(num_train_timesteps)
60
+ elif beta_schedule == "squaredcos_cap_v2":
61
+ self.betas = self._squaredcos_cap_v2_schedule(num_train_timesteps)
62
+ else:
63
+ raise ValueError(f"Unknown beta schedule: {beta_schedule}")
64
+
65
+ # Compute alphas
66
+ self.alphas = 1.0 - self.betas
67
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
68
+ self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)
69
+
70
+ # Calculations for diffusion q(x_t | x_0)
71
+ self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
72
+ self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
73
+
74
+ # Calculations for posterior q(x_{t-1} | x_t, x_0)
75
+ self.posterior_variance = (
76
+ self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
77
+ )
78
+ self.posterior_log_variance_clipped = torch.log(
79
+ torch.cat([self.posterior_variance[1:2], self.posterior_variance[1:]])
80
+ )
81
+ self.posterior_mean_coef1 = (
82
+ self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
83
+ )
84
+ self.posterior_mean_coef2 = (
85
+ (1.0 - self.alphas_cumprod_prev) * torch.sqrt(self.alphas) / (1.0 - self.alphas_cumprod)
86
+ )
87
+
88
+ # For sampling
89
+ self.num_inference_steps = None
90
+ self.timesteps = None
91
+
92
+ def _cosine_beta_schedule(self, timesteps: int, s: float = 0.008) -> torch.Tensor:
93
+ """Cosine schedule as proposed in https://arxiv.org/abs/2102.09672"""
94
+ steps = timesteps + 1
95
+ x = torch.linspace(0, timesteps, steps)
96
+ alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * math.pi * 0.5) ** 2
97
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
98
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
99
+ return torch.clip(betas, 0.0001, 0.9999)
100
+
101
+ def _squaredcos_cap_v2_schedule(self, timesteps: int) -> torch.Tensor:
102
+ """Squared cosine schedule used in improved DDPM"""
103
+ return self._cosine_beta_schedule(timesteps)
104
+
105
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = "cpu"):
106
+ """
107
+ Set the timesteps for inference
108
+
109
+ Args:
110
+ num_inference_steps: Number of steps for inference
111
+ device: Device to put tensors on
112
+ """
113
+ self.num_inference_steps = num_inference_steps
114
+
115
+ # DDIM uses uniform spacing
116
+ step_ratio = self.num_train_timesteps // num_inference_steps
117
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
118
+ self.timesteps = torch.from_numpy(timesteps).to(device)
119
+
120
+ def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
121
+ """Compute variance for given timestep"""
122
+ alpha_prod_t = self.alphas_cumprod[timestep]
123
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else torch.tensor(1.0)
124
+
125
+ beta_prod_t = 1 - alpha_prod_t
126
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
127
+
128
+ variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
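+ # i.e. sigma_t^2 = (1 - abar_{t-1}) / (1 - abar_t) * (1 - abar_t / abar_{t-1}), the DDIM variance (Song et al., 2021)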
129
+
130
+ return variance
131
+
132
+ def add_noise(
133
+ self,
134
+ original_samples: torch.Tensor,
135
+ noise: torch.Tensor,
136
+ timesteps: torch.Tensor,
137
+ ) -> torch.Tensor:
138
+ """
139
+ Add noise to samples for training
140
+
141
+ Args:
142
+ original_samples: Clean samples x_0
143
+ noise: Noise to add
144
+ timesteps: Timesteps for each sample
145
+
146
+ Returns:
147
+ Noisy samples x_t
148
+ """
149
+ # Move coefficients to correct device and dtype
150
+ sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(original_samples.device)
151
+ sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(original_samples.device)
152
+
153
+ sqrt_alpha_prod = sqrt_alphas_cumprod[timesteps]
154
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alphas_cumprod[timesteps]
155
+
156
+ # Reshape for broadcasting
157
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
158
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
159
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
160
+
161
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
162
+
163
+ return noisy_samples
164
+
165
+ def step(
166
+ self,
167
+ model_output: torch.Tensor,
168
+ timestep: int,
169
+ sample: torch.Tensor,
170
+ eta: float = 0.0,
171
+ generator: Optional[torch.Generator] = None,
172
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
173
+ """
174
+ Perform one DDIM denoising step
175
+
176
+ Args:
177
+ model_output: Output from the model (predicted noise or v)
178
+ timestep: Current timestep
179
+ sample: Current noisy sample x_t
180
+ eta: Stochasticity factor (0 = deterministic DDIM, 1 = DDPM)
181
+ generator: Random generator for reproducibility
182
+
183
+ Returns:
184
+ Tuple of (predicted x_{t-1}, predicted x_0)
185
+ """
186
+ # Get previous timestep
187
+ prev_timestep = timestep - self.num_train_timesteps // self.num_inference_steps
188
+
189
+ # Get alpha values
190
+ alpha_prod_t = self.alphas_cumprod[timestep]
191
+ alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else torch.tensor(1.0)
192
+
193
+ beta_prod_t = 1 - alpha_prod_t
194
+
195
+ # Compute predicted x_0
196
+ if self.prediction_type == "epsilon":
197
+ pred_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
198
+ elif self.prediction_type == "v_prediction":
199
+ pred_original_sample = (alpha_prod_t ** 0.5) * sample - (beta_prod_t ** 0.5) * model_output
200
+ else:
201
+ raise ValueError(f"Unknown prediction type: {self.prediction_type}")
202
+
203
+ # Clip predicted x_0
204
+ if self.clip_sample:
205
+ pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
206
+
207
+ # Compute variance
208
+ variance = self._get_variance(timestep, prev_timestep)
209
+ std_dev_t = eta * variance ** 0.5
210
+
211
+ # Compute direction pointing to x_t
212
+ pred_epsilon = (sample - alpha_prod_t ** 0.5 * pred_original_sample) / beta_prod_t ** 0.5  # re-derived from the (clipped) x_0 so this also holds for v-prediction
+ pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t ** 2) ** 0.5 * pred_epsilon
213
+
214
+ # Compute x_{t-1}
215
+ prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
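+ # DDIM update: x_{t-1} = sqrt(abar_{t-1}) * x0_hat + sqrt(1 - abar_{t-1} - sigma_t^2) * eps_hat (plus sigma_t * z below when eta > 0)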
216
+
217
+ # Add noise if eta > 0
218
+ if eta > 0:
219
+ device = model_output.device
220
+ noise = torch.randn(
221
+ model_output.shape,
222
+ generator=generator,
223
+ device=device,
224
+ dtype=model_output.dtype
225
+ )
226
+ prev_sample = prev_sample + std_dev_t * noise
227
+
228
+ return prev_sample, pred_original_sample
229
+
230
+ def get_velocity(
231
+ self,
232
+ sample: torch.Tensor,
233
+ noise: torch.Tensor,
234
+ timesteps: torch.Tensor,
235
+ ) -> torch.Tensor:
236
+ """
237
+ Compute velocity for v-prediction
238
+
239
+ v = sqrt(alpha_bar_t) * noise - sqrt(1 - alpha_bar_t) * sample
240
+ """
241
+ sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(sample.device)
242
+ sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(sample.device)
243
+
244
+ sqrt_alpha_prod = sqrt_alphas_cumprod[timesteps]
245
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alphas_cumprod[timesteps]
246
+
247
+ while len(sqrt_alpha_prod.shape) < len(sample.shape):
248
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
249
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
250
+
251
+ velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
252
+
253
+ return velocity
254
+
255
+
256
258
+
259
+
260
+ def get_ddim_scheduler(config) -> DDIMScheduler:
261
+ """Create DDIM scheduler from config"""
262
+ return DDIMScheduler(
263
+ num_train_timesteps=config.num_train_timesteps,
264
+ beta_start=config.beta_start,
265
+ beta_end=config.beta_end,
266
+ beta_schedule=config.beta_schedule,
267
+ clip_sample=config.clip_sample,
268
+ prediction_type=config.prediction_type,
269
+ )
270
+
271
+
272
+ if __name__ == "__main__":
273
+ # Test the scheduler
274
+ scheduler = DDIMScheduler(
275
+ num_train_timesteps=1000,
276
+ beta_start=0.0001,
277
+ beta_end=0.02,
278
+ beta_schedule="linear",
279
+ )
280
+
281
+ # Test adding noise
282
+ x = torch.randn(2, 3, 16, 64, 64)
283
+ noise = torch.randn_like(x)
284
+ timesteps = torch.tensor([100, 500])
285
+
286
+ noisy_x = scheduler.add_noise(x, noise, timesteps)
287
+ print(f"Original shape: {x.shape}")
288
+ print(f"Noisy shape: {noisy_x.shape}")
289
+
290
+ # Test sampling
291
+ scheduler.set_timesteps(50)
292
+ print(f"Inference timesteps: {scheduler.timesteps[:10]}...")
293
+
294
+ # Test step
295
+ model_output = torch.randn_like(x)
296
+ prev_sample, pred_x0 = scheduler.step(model_output, 500, noisy_x, eta=0.0)
297
+ print(f"Previous sample shape: {prev_sample.shape}")
298
+ print(f"Predicted x0 shape: {pred_x0.shape}")