herrscher0 committed e86746e (0 parents)

Initial commit: FloodDiffusionTiny - Tiny text-to-motion model with UMT5-Base
.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ldf_deps/t5_umt5-xxl-enc-bf16/google/umt5-xxl/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,54 @@
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ venv/
10
+ env/
11
+ ENV/
12
+ .venv
13
+
14
+ # PyTorch/Model cache
15
+ *.pth~
16
+ *.safetensors~
17
+ checkpoint/
18
+ checkpoints/
19
+
20
+ # Hugging Face cache
21
+ .cache/
22
+ huggingface_cache/
23
+
24
+ # Generated outputs
25
+ outputs/
26
+ generated_motions/
27
+ *.npy
28
+ *.pkl
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Jupyter
42
+ .ipynb_checkpoints/
43
+ *.ipynb
44
+
45
+ # Logs
46
+ *.log
47
+ logs/
48
+ wandb/
49
+
50
+ # Test outputs
51
+ test_output/
52
+ test_results/
53
+ tmp/
54
+
README.md ADDED
@@ -0,0 +1,177 @@
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - text-to-motion
5
+ - motion-generation
6
+ - diffusion-forcing
7
+ - humanml3d
8
+ - computer-animation
9
+ library_name: transformers
10
+ pipeline_tag: other
11
+ ---
12
+
13
+ # FloodDiffusion: Tailored Diffusion Forcing for Streaming Motion Generation
14
+
15
+ <div align="center">
16
+
17
+ **A TINY version of the original FloodDiffusion**
18
+
19
+ [Paper](https://arxiv.org/abs/2512.03520) | [Github](https://github.com/ShandaAI/FloodDiffusion) | [Project Page](https://shandaai.github.io/FloodDiffusion/)
20
+
21
+ </div>
22
+
23
+ ## Installation
24
+
25
+ ### Prerequisites
26
+
27
+ - Python 3.8+
28
+ - CUDA-capable GPU with 16GB+ VRAM (recommended)
29
+ - 16GB+ system RAM
30
+
31
+ ### Dependencies
32
+
33
+ **Step 1: Install basic dependencies**
34
+
35
+ ```bash
36
+ pip install torch transformers huggingface_hub
37
+ pip install lightning diffusers omegaconf ftfy numpy
38
+ ```
39
+
40
+ **Step 2: Install Flash Attention (Required)**
41
+
42
+ Flash Attention requires CUDA and may need compilation. Install it with:
43
+
44
+ ```bash
45
+ pip install flash-attn --no-build-isolation
46
+ ```
47
+
48
+ **Note:** Flash attention is **required** for this model. If installation fails, please refer to the [official flash-attention installation guide](https://github.com/Dao-AILab/flash-attention#installation-and-features).
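+
+ A quick way to confirm the install succeeded before loading the model (a minimal check, nothing model-specific):
+
+ ```python
+ # If this import fails, the model cannot run; reinstall flash-attn per the guide above.
+ import flash_attn
+ print(flash_attn.__version__)
+ ```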
49
+
50
+ ## Quick Start
51
+
52
+ ### Basic Usage
53
+
54
+ ```python
55
+ from transformers import AutoModel
56
+
57
+ # Load model
58
+ model = AutoModel.from_pretrained(
59
+ "ShandaAI/FloodDiffusionTiny",
60
+ trust_remote_code=True
61
+ )
62
+
63
+ # Generate motion from text (263-dim HumanML3D features)
64
+ motion = model("a person walking forward", length=60)
65
+ print(f"Generated motion: {motion.shape}") # (~240, 263)
66
+
67
+ # Generate motion as joint coordinates (22 joints × 3 coords) with EMA smoothing (alpha: 0.0-1.0)
68
+ motion_joints = model("a person walking forward", length=60, output_joints=True, smoothing_alpha=0.5)
69
+ print(f"Generated joints: {motion_joints.shape}") # (~240, 22, 3)
70
+ ```
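+
+ The outputs are plain NumPy arrays, so they can be saved or post-processed directly (illustrative file names):
+
+ ```python
+ import numpy as np
+
+ np.save("walk_features.npy", motion)        # (frames, 263) HumanML3D features
+ np.save("walk_joints.npy", motion_joints)   # (frames, 22, 3) joint coordinates
+ ```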
71
+
72
+ ### Batch Generation
73
+
74
+ ```python
75
+ # Generate multiple motions efficiently
76
+ texts = [
77
+ "a person walking forward",
78
+ "a person running quickly",
79
+ "a person jumping up and down"
80
+ ]
81
+ lengths = [60, 50, 40] # Different lengths for each motion
82
+
83
+ motions = model(texts, length=lengths)
84
+
85
+ for i, motion in enumerate(motions):
86
+ print(f"Motion {i}: {motion.shape}")
87
+ ```
88
+
89
+ ### Multi-Text Motion Transitions
90
+
91
+ ```python
92
+ # Generate a motion sequence with smooth transitions between actions
93
+ motion = model(
94
+ text=[["walk forward", "turn around", "run back"]],
95
+ length=[120],
96
+ text_end=[[40, 80, 120]] # Transition points in latent tokens
97
+ )
98
+
99
+ # Output: ~480 frames showing all three actions smoothly connected
100
+ print(f"Transition motion: {motion[0].shape}")
101
+ ```
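+
+ Because each latent token decodes to roughly 4 frames, the `text_end` values can be mapped to approximate frame indices in the output (a rough sketch using the 4x factor described in the API reference below):
+
+ ```python
+ text_end = [40, 80, 120]
+ frame_boundaries = [t * 4 for t in text_end]  # ~[160, 320, 480]
+ ```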
102
+
103
+ ## API Reference
104
+
105
+ ### `model(text, length=60, text_end=None, num_denoise_steps=None, output_joints=False, smoothing_alpha=1.0)`
106
+
107
+ Generate motion sequences from text descriptions.
108
+
109
+ **Parameters:**
110
+
111
+ - **text** (`str`, `List[str]`, or `List[List[str]]`): Text description(s)
112
+ - Single string: Generate one motion
113
+ - List of strings: Batch generation
114
+ - Nested list: Multiple text prompts per motion (for transitions)
115
+
116
+ - **length** (`int` or `List[int]`, default=60): Number of latent tokens to generate
117
+ - Output frames ≈ `length × 4` (due to VAE upsampling)
118
+ - Example: `length=60` → ~240 frames (~12 seconds at 20 FPS)
119
+
120
+ - **text_end** (`List[int]` or `List[List[int]]`, optional): Latent token positions for text transitions
121
+ - Only used when `text` is a nested list
122
+ - Specifies when to switch between different text descriptions
123
+ - **IMPORTANT**: Must have the same length as the corresponding text list
124
+ - Example: `text=[["walk", "turn", "sit"]]` requires `text_end=[[20, 40, 60]]` (3 endpoints for 3 texts)
125
+ - Must be in ascending order
126
+
127
+ - **num_denoise_steps** (`int`, optional): Number of denoising iterations
128
+ - Higher values produce better quality but slower generation
129
+ - Recommended range: 10-50
+ - Must be a multiple of the model's chunk size (5 in this configuration; the model asserts this at generation time)
130
+
131
+ - **output_joints** (`bool`, default=False): Output format selector
132
+ - `False`: Returns 263-dimensional HumanML3D features
133
+ - `True`: Returns 22×3 joint coordinates for direct visualization
134
+
135
+ - **smoothing_alpha** (`float`, default=1.0): EMA smoothing factor for joint positions (only used when `output_joints=True`)
136
+ - `1.0`: No smoothing (default)
137
+ - `0.5`: Medium smoothing (recommended for smoother animations)
138
+ - `0.0`: Maximum smoothing
139
+ - Range: 0.0 to 1.0
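+
+ Conceptually, the smoothing applied under `output_joints=True` is a per-joint exponential moving average; a minimal sketch of that behaviour (the actual implementation lives in `ldf_utils/motion_process.py` via `StreamJointRecovery263`):
+
+ ```python
+ import numpy as np
+
+ def ema_smooth(joints, alpha=0.5):
+     """joints: (frames, 22, 3). alpha=1.0 keeps the raw output; smaller alpha smooths more."""
+     out = np.empty_like(joints)
+     out[0] = joints[0]
+     for t in range(1, len(joints)):
+         out[t] = alpha * joints[t] + (1.0 - alpha) * out[t - 1]
+     return out
+ ```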
140
+
141
+ **Returns:**
142
+ - Single motion:
143
+ - `output_joints=False`: `numpy.ndarray` of shape `(frames, 263)`
144
+ - `output_joints=True`: `numpy.ndarray` of shape `(frames, 22, 3)`
145
+ - Batch: `List[numpy.ndarray]` with shapes as above
146
+
147
+ **Example:**
148
+ ```python
149
+ # Single generation (263-dim features)
150
+ motion = model("walk forward", length=60) # Returns (240, 263)
151
+
152
+ # Single generation (joint coordinates)
153
+ joints = model("walk forward", length=60, output_joints=True) # Returns (240, 22, 3)
154
+
155
+ # Batch generation
156
+ motions = model(["walk", "run"], length=[60, 50]) # Returns list of 2 arrays
157
+
158
+ # Multi-text transitions
159
+ motion = model(
160
+ [["walk", "turn"]],
161
+ length=[60],
162
+ text_end=[[30, 60]]
163
+ ) # Returns list with 1 array of shape (240, 263)
164
+ ```
165
+
166
+ ## Citation
167
+
168
+ If you use this model in your research, please cite:
169
+
170
+ ```bibtex
171
+ @article{cai2025flooddiffusion,
172
+ title={FloodDiffusion: Tailored Diffusion Forcing for Streaming Motion Generation},
173
+ author={Yiyi Cai and Yuhan Wu and Kunhang Li and You Zhou and Bo Zheng and Haiyang Liu},
174
+ journal={arXiv preprint arXiv:2512.03520},
175
+ year={2025}
176
+ }
177
+ ```
__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ FloodDiffusion - Text-to-Motion Generation
3
+
4
+ Usage:
5
+ from transformers import AutoModel
6
+
7
+ model = AutoModel.from_pretrained("ShandaAI/FloodDiffusionTiny", trust_remote_code=True)
8
+ motion = model("a person walking forward", length=60)
9
+ """
10
+
11
+ __version__ = "1.0.0"
config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "architectures": ["LDFModel"],
3
+ "model_type": "ldf_motion",
4
+ "auto_map": {
5
+ "AutoModel": "hf_pipeline.LDFModel",
6
+ "AutoConfig": "hf_pipeline.LDFConfig"
7
+ },
8
+ "torch_dtype": "float32",
9
+ "transformers_version": "4.30.0",
10
+ "license": "mit"
11
+ }
generate_ldf.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+ import sys
3
+
4
+ import torch
5
+ from lightning import seed_everything
6
+ from safetensors.torch import load_file as load_safetensors
7
+
8
+ from ldf_utils.initialize import compare_statedict_and_parameters, instantiate, load_config
9
+
10
+ # Set tokenizers parallelism to false to avoid warnings in multiprocessing
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+
14
+ def load_model_from_config():
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ torch.set_float32_matmul_precision("high")
17
+ cfg = load_config()
18
+ seed_everything(cfg.seed)
19
+
20
+ # Get the directory containing the config file
21
+ # Try to find config directory from sys.argv or use current directory
22
+ if '--config' in sys.argv:
23
+ config_idx = sys.argv.index('--config') + 1
24
+ config_dir = os.path.dirname(os.path.abspath(sys.argv[config_idx]))
25
+ else:
26
+ config_dir = os.getcwd()
27
+
28
+ vae = instantiate(
29
+ target=cfg.test_vae.target,
30
+ cfg=None,
31
+ hfstyle=False,
32
+ **cfg.test_vae.params,
33
+ )
34
+
35
+ # Handle relative paths
36
+ vae_path = cfg.test_vae_ckpt
37
+ if not os.path.isabs(vae_path):
38
+ vae_path = os.path.join(config_dir, vae_path)
39
+
40
+ # Load from safetensors (already contains EMA weights)
41
+ vae_state_dict = load_safetensors(vae_path)
42
+ vae.load_state_dict(vae_state_dict, strict=True)
43
+ print(f"Loaded VAE model from {vae_path}")
44
+
45
+ compare_statedict_and_parameters(
46
+ state_dict=vae.state_dict(),
47
+ named_parameters=vae.named_parameters(),
48
+ named_buffers=vae.named_buffers(),
49
+ )
50
+ vae.to(device)
51
+ vae.eval()
52
+
53
+ # Model - fix relative paths in model params
54
+ model_params = dict(cfg.model.params)
55
+ # Convert relative paths to absolute paths
56
+ if 'checkpoint_path' in model_params and model_params['checkpoint_path']:
57
+ if not os.path.isabs(model_params['checkpoint_path']):
58
+ model_params['checkpoint_path'] = os.path.join(config_dir, model_params['checkpoint_path'])
59
+ if 'tokenizer_path' in model_params and model_params['tokenizer_path']:
60
+ if not os.path.isabs(model_params['tokenizer_path']):
61
+ model_params['tokenizer_path'] = os.path.join(config_dir, model_params['tokenizer_path'])
62
+
63
+ model = instantiate(
64
+ target=cfg.model.target, cfg=None, hfstyle=False, **model_params
65
+ )
66
+
67
+ # Handle relative paths
68
+ model_path = cfg.test_ckpt
69
+ if not os.path.isabs(model_path):
70
+ model_path = os.path.join(config_dir, model_path)
71
+
72
+ # Load from safetensors (already contains EMA weights)
73
+ model_state_dict = load_safetensors(model_path)
74
+ model.load_state_dict(model_state_dict, strict=True)
75
+ print(f"Loaded model from {model_path}")
76
+
77
+ compare_statedict_and_parameters(
78
+ state_dict=model.state_dict(),
79
+ named_parameters=model.named_parameters(),
80
+ named_buffers=model.named_buffers(),
81
+ )
82
+ model.to(device)
83
+ model.eval()
84
+
85
+ return vae, model
86
+
87
+
88
+ @torch.inference_mode()
89
+ def generate_feature_stream(
90
+ model, feature_length, text, feature_text_end=None, num_denoise_steps=None
91
+ ):
92
+ """
93
+ Streaming interface for feature generation
94
+ Args:
95
+ model: Loaded model
96
+ feature_length: List[int], generation length for each sample
97
+ text: List[str] or List[List[str]], text prompts
98
+ feature_text_end: List[List[int]], time points where text ends (if text is list of list)
99
+ num_denoise_steps: Number of denoising steps
100
+ Yields:
101
+ dict: Contains "generated" (current generated feature segment)
102
+ """
103
+
104
+ # Construct input dict x
105
+ # stream_generate needs x to contain "feature_length", "text", "feature_text_end" (if text is list of list)
106
+ x = {"feature_length": torch.tensor(feature_length), "text": text}
107
+
108
+ if feature_text_end is not None:
109
+ x["feature_text_end"] = feature_text_end
110
+
111
+ # Call model's stream_generate
112
+ # Note: stream_generate is a generator
113
+ generator = model.stream_generate(x, num_denoise_steps=num_denoise_steps)
114
+
115
+ for step_output in generator:
116
+ # step_output is already a dict with "generated" key
117
+ yield step_output
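+
+ # Usage sketch (illustrative, not part of the original script): consume the generator
+ # and concatenate the committed feature segments as they arrive.
+ #
+ #   segments = []
+ #   for step in generate_feature_stream(model, [60], ["a person walks forward"]):
+ #       chunk = step["generated"][0]       # latent features for sample 0, or None once finished
+ #       if chunk is not None:
+ #           segments.append(chunk)
+ #   features = torch.cat(segments, dim=0)  # (tokens, 4); decode with vae.decode for 263-dim frames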
118
+
119
+
120
+ if __name__ == "__main__":
121
+ import argparse
122
+
123
+ parser = argparse.ArgumentParser()
124
+ parser.add_argument("--config", type=str, required=True, help="Path to config")
125
+ parser.add_argument(
126
+ "--text", type=str, default="a person walks forward", help="Text prompt"
127
+ )
128
+ parser.add_argument("--length", type=int, default=120, help="Motion length")
129
+ parser.add_argument(
130
+ "--output", type=str, default="output.mp4", help="Output video path"
131
+ )
132
+ parser.add_argument(
133
+ "--num_denoise_steps", type=int, default=None, help="Number of denoising steps"
134
+ )
135
+ args = parser.parse_args()
136
+
137
+ print("Loading model...")
138
+ vae, model = load_model_from_config()
139
+
hf_pipeline.py ADDED
@@ -0,0 +1,282 @@
1
+ """
2
+ LDF Model for Hugging Face Hub
3
+
4
+ Usage:
5
+ from transformers import AutoModel
6
+
7
+ model = AutoModel.from_pretrained("ShandaAI/FloodDiffusion", trust_remote_code=True)
8
+ motion = model("a person walking forward", length=60)
9
+ """
10
+
11
+ import torch
12
+ from transformers import PretrainedConfig, PreTrainedModel
13
+ from typing import Union, List, Optional
14
+ import os
15
+ import sys
16
+
17
+
18
+ class LDFConfig(PretrainedConfig):
19
+ """Configuration for LDF Motion Generation Model"""
20
+ model_type = "ldf_motion"
21
+
22
+ def __init__(
23
+ self,
24
+ input_dim=4,
25
+ output_dim=263,
26
+ **kwargs
27
+ ):
28
+ super().__init__(**kwargs)
29
+ self.input_dim = input_dim
30
+ self.output_dim = output_dim
31
+
32
+
33
+ class LDFModel(PreTrainedModel):
34
+ """
35
+ LDF Motion Generation Model
36
+
37
+ This model generates motion sequences from text descriptions using Latent Diffusion Forcing.
38
+
39
+ Example:
40
+ >>> from transformers import AutoModel
41
+ >>> model = AutoModel.from_pretrained("ShandaAI/FloodDiffusion", trust_remote_code=True)
42
+ >>> motion = model("a person walking forward", length=60)
43
+ >>> print(motion.shape) # (~240, 263)
44
+ """
45
+
46
+ config_class = LDFConfig
47
+
48
+ def __init__(self, config):
49
+ super().__init__(config)
50
+ self.config = config
51
+
52
+ # Will be loaded in from_pretrained
53
+ self.ldf_model = None
54
+ self.vae = None
55
+ self.model_dir = None # Store model directory for later use
56
+
57
+ def _load_models(self):
58
+ """Load the actual LDF and VAE models"""
59
+ if self.ldf_model is not None:
60
+ return # Already loaded
61
+
62
+ # Get the model directory - should be set by from_pretrained
63
+ if hasattr(self, 'name_or_path') and os.path.exists(self.name_or_path):
64
+ model_dir = self.name_or_path
65
+ else:
66
+ raise RuntimeError(
67
+ "Model directory not found. Please use from_pretrained() to load the model."
68
+ )
69
+
70
+ # Save model_dir for later use (e.g., in output_joints conversion)
71
+ self.model_dir = model_dir
72
+
73
+ # Add model_dir to sys.path for imports
74
+ if model_dir not in sys.path:
75
+ sys.path.insert(0, model_dir)
76
+
77
+ # Use dynamic import to avoid HF's static import checker
78
+ import importlib
79
+ generate_ldf = importlib.import_module('generate_ldf')
80
+ load_model_from_config = generate_ldf.load_model_from_config
81
+
82
+ config_path = os.path.join(model_dir, "ldf.yaml")
83
+ old_argv = sys.argv
84
+ sys.argv = ['model', '--config', config_path]
85
+
86
+ try:
87
+ self.vae, self.ldf_model = load_model_from_config()
88
+
89
+ # Move to correct device
90
+ device = next(self.parameters()).device if list(self.parameters()) else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
91
+ self.ldf_model = self.ldf_model.to(device)
92
+ self.vae = self.vae.to(device)
93
+ finally:
94
+ sys.argv = old_argv
95
+
96
+ @classmethod
97
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
98
+ """
99
+ Load pretrained model
100
+
101
+ Args:
102
+ pretrained_model_name_or_path: Model name or path
103
+ trust_remote_code: Must be True to load this custom model
104
+ **kwargs: Additional arguments
105
+
106
+ Returns:
107
+ LDFModel instance
108
+ """
109
+ # Check trust_remote_code
110
+ if not kwargs.get('trust_remote_code', False):
111
+ raise ValueError(
112
+ "Loading this model requires trust_remote_code=True. "
113
+ "Usage: AutoModel.from_pretrained(..., trust_remote_code=True)"
114
+ )
115
+
116
+ # Download if needed
117
+ if not os.path.exists(pretrained_model_name_or_path):
118
+ from huggingface_hub import snapshot_download
119
+ model_path = snapshot_download(repo_id=pretrained_model_name_or_path)
120
+ else:
121
+ model_path = pretrained_model_name_or_path
122
+
123
+ # Load config
124
+ config = LDFConfig.from_pretrained(model_path)
125
+
126
+ # Create model
127
+ model = cls(config)
128
+ model.name_or_path = model_path
129
+
130
+ # Load the actual models
131
+ model._load_models()
132
+
133
+ return model
134
+
135
+ def forward(
136
+ self,
137
+ text: Union[str, List[str], List[List[str]]],
138
+ length: Union[int, List[int]] = 60,
139
+ text_end: Optional[Union[List[int], List[List[int]]]] = None,
140
+ num_denoise_steps: Optional[int] = None,
141
+ **kwargs
142
+ ):
143
+ """
144
+ Generate motion from text
145
+
146
+ Args:
147
+ text: Text description(s)
148
+ length: Number of latent tokens (output frames ≈ length × 4)
149
+ text_end: Transition points for multi-text
150
+ num_denoise_steps: Number of denoising steps
151
+
152
+ Returns:
153
+ Generated motion sequence(s)
154
+ """
155
+ return self.__call__(text, length, text_end, num_denoise_steps, **kwargs)
156
+
157
+ @torch.no_grad()
158
+ def __call__(
159
+ self,
160
+ text: Union[str, List[str], List[List[str]]],
161
+ length: Union[int, List[int]] = 60,
162
+ text_end: Optional[Union[List[int], List[List[int]]]] = None,
163
+ num_denoise_steps: Optional[int] = None,
164
+ output_joints: bool = False,
165
+ smoothing_alpha: float = 1.0
166
+ ):
167
+ """
168
+ Generate motion sequences
169
+
170
+ Args:
171
+ text: Text description
172
+ - Single string: "walk" -> single sample
173
+ - String list: ["walk", "run"] -> batch
174
+ - Nested list: [["walk", "turn"], ["run", "jump"]] -> multi-text per sample
175
+ length: Number of latent tokens (frames ≈ length × 4)
176
+ text_end: Token positions for text switching
177
+ num_denoise_steps: Number of denoising steps
178
+ output_joints: If True, output 22×3 joint coordinates; if False (default), output 263-dim HumanML3D features
179
+ smoothing_alpha: EMA smoothing factor for joint positions (0.0-1.0, default=1.0 no smoothing)
180
+ - Only used when output_joints=True
181
+ - Recommended: 0.5 for smoother animations
182
+
183
+ Returns:
184
+ numpy.ndarray or list of arrays
185
+ - If output_joints=False: shape (frames, 263)
186
+ - If output_joints=True: shape (frames, 22, 3)
187
+ """
188
+ # Ensure models are loaded
189
+ self._load_models()
190
+
191
+ # Normalize inputs
192
+ is_single = not isinstance(length, list)
193
+ if is_single:
194
+ text_batch = [text]
195
+ length_batch = [length]
196
+ text_end_batch = [text_end] if text_end is not None else None
197
+ else:
198
+ text_batch = text
199
+ length_batch = length
200
+ text_end_batch = text_end
201
+
202
+ # Validate text_end alignment with text
203
+ if text_end_batch is not None:
204
+ for i, (txt, te) in enumerate(zip(text_batch, text_end_batch)):
205
+ if isinstance(txt, list) and te is not None:
206
+ if len(txt) != len(te):
207
+ raise ValueError(
208
+ f"Batch {i}: text has {len(txt)} segments but text_end has {len(te)} endpoints. "
209
+ f"They must match! text={txt}, text_end={te}"
210
+ )
211
+
212
+ batch_size = len(text_batch)
213
+
214
+ # Construct input dict for model
215
+ x = {"feature_length": torch.tensor(length_batch), "text": text_batch}
216
+ if text_end_batch is not None:
217
+ x["feature_text_end"] = text_end_batch
218
+
219
+ # Non-streaming generate (following generate_ldf.py 125-139)
220
+ output = self.ldf_model.generate(x, num_denoise_steps=num_denoise_steps)
221
+ generated_batch = output["generated"]
222
+
223
+ # Decode with VAE and optionally convert to joints
224
+ decoded_results = []
225
+ joints_results = [] if output_joints else None
226
+
227
+ # Import motion processing module once if needed
228
+ if output_joints:
229
+ import importlib.util
230
+ import numpy as np
231
+ utils_spec = importlib.util.spec_from_file_location(
232
+ "motion_process",
233
+ os.path.join(self.model_dir, "ldf_utils", "motion_process.py")
234
+ )
235
+ motion_process_module = importlib.util.module_from_spec(utils_spec)
236
+ utils_spec.loader.exec_module(motion_process_module)
237
+
238
+ for i, generated in enumerate(generated_batch):
239
+ if generated is not None and torch.is_tensor(generated):
240
+ # Decode with VAE (following generate_ldf.py line 130)
241
+ decoded_g = self.vae.decode(generated[None, :])[0]
242
+
243
+ if output_joints:
244
+ # Convert to joints using StreamJointRecovery263 with smoothing
245
+ # Create a new recovery instance for each sample to maintain independent state
246
+ decoded_np = decoded_g.cpu().numpy()
247
+ recovery = motion_process_module.StreamJointRecovery263(
248
+ joints_num=22, smoothing_alpha=smoothing_alpha
249
+ )
250
+ joints = [recovery.process_frame(frame) for frame in decoded_np]
251
+ joints = np.array(joints)
252
+ joints_results.append(joints)
253
+ else:
254
+ decoded_results.append(decoded_g.cpu().numpy())
255
+ else:
256
+ if output_joints:
257
+ joints_results.append(None)
258
+ else:
259
+ decoded_results.append(None)
260
+
261
+ # Return results
262
+ if output_joints:
263
+ return joints_results[0] if is_single else joints_results
264
+ else:
265
+ return decoded_results[0] if is_single else decoded_results
266
+
267
+ def generate(self, *args, **kwargs):
268
+ """Alias for __call__ to match transformers API"""
269
+ return self.__call__(*args, **kwargs)
270
+
271
+
272
+ # For backwards compatibility
273
+ LDFPipeline = LDFModel
274
+
275
+
276
+ # Register with AutoModel
277
+ try:
278
+ from transformers import AutoModel, AutoConfig
279
+ AutoConfig.register("ldf_motion", LDFConfig)
280
+ AutoModel.register(LDFConfig, LDFModel)
281
+ except Exception:
282
+ pass
ldf.yaml ADDED
@@ -0,0 +1,44 @@
1
+ exp_name: ldf
2
+ seed: 1234
3
+ debug: false
4
+ train: false
5
+
6
+ save_dir: ./outputs
7
+ resume_ckpt: null
8
+ test_ckpt: "model.safetensors"
9
+ test_vae_ckpt: "vae.safetensors"
10
+
11
+ test_vae:
12
+ target: ldf_models.vae_wan_1d.VAEWanModel
13
+ ema_decay: 0.99
14
+ params:
15
+ input_dim: 263
16
+ z_dim: 4
17
+
18
+ test_setting:
19
+ render: false
20
+ simple: true
21
+ recover_dim: 263
22
+
23
+ val_repeat: 1
24
+
25
+ model:
26
+ target: ldf_models.diffusion_forcing_wan_tiny.DiffForcingWanModel
27
+ ema_decay: 0.99
28
+ params:
29
+ model_name: "google/umt5-base"
30
+ input_dim: 4
31
+ noise_steps: 10
32
+ hidden_dim: 256
33
+ ffn_dim: 1024
34
+ freq_dim: 64
35
+ num_heads: 8
36
+ num_layers: 8
37
+ time_embedding_scale: 1.0
38
+ chunk_size: 5
39
+ use_text_cond: True
40
+ text_len: 128
41
+ drop_out: 0.1
42
+ cfg_scale: 5.0
43
+ prediction_type: "vel"
44
+ causal: False
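+
+ # Loading sketch (illustrative; the project reads this file through ldf_utils.initialize.load_config).
+ # With OmegaConf, which the README lists as a dependency:
+ #   from omegaconf import OmegaConf
+ #   cfg = OmegaConf.load("ldf.yaml")
+ #   cfg.model.params.hidden_dim  # -> 256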
ldf_models/__init__.py ADDED
File without changes
ldf_models/diffusion_forcing_wan_tiny.py ADDED
@@ -0,0 +1,943 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from transformers import AutoTokenizer, AutoModel
8
+
9
+ from .tools.wan_model import WanModel
10
+
11
+
12
+ class HFT5Encoder:
13
+ """Wrapper for HuggingFace T5 encoder, compatible with original T5EncoderModel interface"""
14
+ def __init__(self, text_len, dtype=torch.float32, device=torch.device("cpu"), model_name="google/umt5-base"):
15
+ self.text_len = text_len
16
+ self.dtype = dtype
17
+ self.device = device
18
+
19
+ print(f"Loading {model_name} from HuggingFace...")
20
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
21
+ self.model = AutoModel.from_pretrained(
22
+ model_name,
23
+ torch_dtype=dtype  # torch_dtype is the long-standing from_pretrained kwarg for weight dtype
24
+ ).encoder # Only use the encoder part
25
+ self.model.eval()
26
+ self.model.requires_grad_(False)
27
+ self.model.to(device)
28
+
29
+ def __call__(self, texts, device):
30
+ """Encode texts, returns list of tensors (one per text, with padding removed)"""
31
+ # Tokenize
32
+ inputs = self.tokenizer(
33
+ texts,
34
+ padding=True,
35
+ truncation=True,
36
+ max_length=self.text_len,
37
+ return_tensors="pt"
38
+ )
39
+ ids = inputs.input_ids.to(device)
40
+ mask = inputs.attention_mask.to(device)
41
+
42
+ # Encode (model should already be on device via external .model.to(device) call)
43
+ context = self.model(input_ids=ids, attention_mask=mask).last_hidden_state
44
+
45
+ # Get sequence lengths (excluding padding)
46
+ seq_lens = mask.sum(dim=1).long()
47
+
48
+ # Return list of tensors with padding removed (same as original T5EncoderModel)
49
+ return [u[:v] for u, v in zip(context, seq_lens)]
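+
+ # Illustrative call (shapes only; token counts depend on the tokenizer):
+ #   HFT5Encoder(text_len=128)(["walk forward", "run"], device)
+ #   -> [Tensor(n_tokens_0, 768), Tensor(n_tokens_1, 768)]  # umt5-base hidden size is 768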
50
+
51
+
52
+ class DiffForcingWanModel(nn.Module):
53
+ def __init__(
54
+ self,
55
+ model_name="google/umt5-base", # HuggingFace model name
56
+ input_dim=256,
57
+ hidden_dim=1024,
58
+ ffn_dim=2048,
59
+ freq_dim=256,
60
+ num_heads=8,
61
+ num_layers=8,
62
+ time_embedding_scale=1.0,
63
+ chunk_size=5,
64
+ noise_steps=10,
65
+ use_text_cond=True,
66
+ text_len=512,
67
+ drop_out=0.1,
68
+ cfg_scale=5.0,
69
+ prediction_type="vel", # "vel", "x0", "noise"
70
+ causal=False,
71
+ ):
72
+ super().__init__()
73
+
74
+ self.input_dim = input_dim
75
+ self.hidden_dim = hidden_dim
76
+ self.ffn_dim = ffn_dim
77
+ self.freq_dim = freq_dim
78
+ self.num_heads = num_heads
79
+ self.num_layers = num_layers
80
+ self.time_embedding_scale = time_embedding_scale
81
+ self.chunk_size = chunk_size
82
+ self.noise_steps = noise_steps
83
+ self.use_text_cond = use_text_cond
84
+ self.drop_out = drop_out
85
+ self.cfg_scale = cfg_scale
86
+ self.prediction_type = prediction_type
87
+ self.causal = causal
88
+
89
+ self.text_dim = 768 # umt5-base hidden size
90
+ self.text_len = text_len
91
+ self.model_name = model_name
92
+
93
+ # Load model and tokenizer from HuggingFace
94
+ print(f"Loading {model_name} from HuggingFace...")
95
+ self.text_encoder = HFT5Encoder(
96
+ text_len=text_len,
97
+ dtype=torch.bfloat16,
98
+ device=torch.device("cpu"),
99
+ model_name=model_name,
100
+ )
101
+
102
+ # Text encoding cache
103
+ self.text_cache = {}
104
+ self.model = WanModel(
105
+ model_type="t2v",
106
+ patch_size=(1, 1, 1),
107
+ text_len=self.text_len,
108
+ in_dim=self.input_dim,
109
+ dim=self.hidden_dim,
110
+ ffn_dim=self.ffn_dim,
111
+ freq_dim=self.freq_dim,
112
+ text_dim=self.text_dim,
113
+ out_dim=self.input_dim,
114
+ num_heads=self.num_heads,
115
+ num_layers=self.num_layers,
116
+ window_size=(-1, -1),
117
+ qk_norm=True,
118
+ cross_attn_norm=True,
119
+ eps=1e-6,
120
+ causal=self.causal,
121
+ )
122
+ self.param_dtype = torch.float32
123
+
124
+ def encode_text_with_cache(self, text_list, device):
125
+ """Encode text using cache
126
+ Args:
127
+ text_list: List[str], list of texts
128
+ device: torch.device
129
+ Returns:
130
+ List[Tensor]: List of encoded text features
131
+ """
132
+ text_features = []
133
+ indices_to_encode = []
134
+ texts_to_encode = []
135
+
136
+ # Check cache
137
+ for i, text in enumerate(text_list):
138
+ if text in self.text_cache:
139
+ # Get from cache and move to correct device
140
+ cached_feature = self.text_cache[text].to(device)
141
+ text_features.append(cached_feature)
142
+ else:
143
+ # Need to encode
144
+ text_features.append(None)
145
+ indices_to_encode.append(i)
146
+ texts_to_encode.append(text)
147
+
148
+ # Batch encode uncached texts
149
+ if texts_to_encode:
150
+ self.text_encoder.model.to(device)
151
+ encoded = self.text_encoder(texts_to_encode, device)
152
+
153
+ # Store in cache and update results
154
+ for idx, text, feature in zip(indices_to_encode, texts_to_encode, encoded):
155
+ # Cache to CPU to save GPU memory
156
+ self.text_cache[text] = feature.cpu()
157
+ text_features[idx] = feature
158
+
159
+ return text_features
160
+
161
+ def preprocess(self, x):
162
+ # (bs, T, C) -> (bs, C, T, 1, 1)
163
+ x = x.permute(0, 2, 1)[:, :, :, None, None]
164
+ return x
165
+
166
+ def postprocess(self, x):
167
+ # (bs, C, T, 1, 1) -> (bs, T, C)
168
+ x = x.permute(0, 2, 1, 3, 4).contiguous().view(x.size(0), x.size(2), -1)
169
+ return x
170
+
171
+ def _get_noise_levels(self, device, seq_len, time_steps):
172
+ """Get noise levels"""
173
+ # noise_level[i] = clip(1 + i / chunk_size - time_steps, 0, 1)
174
+ noise_level = torch.clamp(
175
+ 1
176
+ + torch.arange(seq_len, device=device) / self.chunk_size
177
+ - time_steps.unsqueeze(1),
178
+ min=0.0,
179
+ max=1.0,
180
+ )
181
+ return noise_level
182
+
183
+ def add_noise(self, x, noise_level):
184
+ """Add noise
185
+ Args:
186
+ x: (B, T, D)
187
+ noise_level: (B, T)
188
+ """
189
+ noise = torch.randn_like(x)
190
+ # noise_level: (B, T) -> (B, T, 1)
191
+ noise_level = noise_level.unsqueeze(-1)
192
+ noisy_x = x * (1 - noise_level) + noise_level * noise
193
+ return noisy_x, noise
194
+
195
+ def forward(self, x):
196
+ feature = x["feature"] # (B, T, C)
197
+ feature_length = x["feature_length"] # (B,)
198
+ batch_size, seq_len, _ = feature.shape
199
+ device = feature.device
200
+
201
+ # Randomly use a time step
202
+ time_steps = []
203
+ for i in range(batch_size):
204
+ valid_len = feature_length[i].item()
205
+ # Random float from 0 to valid_len/chunk_size, not an integer
206
+ max_time = valid_len / self.chunk_size
207
+ # max_time = valid_len / self.chunk_size + 1
208
+ time_steps.append(torch.FloatTensor(1).uniform_(0, max_time).item())
209
+ time_steps = torch.tensor(time_steps, device=device) # (B,)
210
+ noise_level = self._get_noise_levels(device, seq_len, time_steps) # (B, T)
211
+
212
+ # # Debug: Print noise levels
213
+ # print("Time steps and corresponding noise levels:")
214
+ # for i in range(batch_size):
215
+ # t = time_steps[i].item()
216
+ # # Get noise level at each position
217
+ # start_idx = int(self.chunk_size * (t - 1))
218
+ # end_idx = int(self.chunk_size * t) + 2
219
+ # # Limit to valid range
220
+ # start_idx = max(0, start_idx)
221
+ # end_idx = min(seq_len, end_idx)
222
+ # print(time_steps[i])
223
+ # print(noise_level[i, start_idx:end_idx])
224
+
225
+ # Add noise to entire sequence
226
+ noisy_feature, noise = self.add_noise(feature, noise_level) # (B, T, D)
227
+
228
+ # Debug: Print noise addition information
229
+ # print("Added noise levels at chunk positions:")
230
+ # for i in range(batch_size):
231
+ # t = time_steps[i].item()
232
+ # start_idx = int(self.chunk_size * (t - 1))
233
+ # end_idx = int(self.chunk_size * t) + 2
234
+ # # Limit to valid range
235
+ # start_idx = max(0, start_idx)
236
+ # end_idx = min(seq_len, end_idx)
237
+ # test1 = (
238
+ # feature[i, start_idx:end_idx, :] - noisy_feature[i, start_idx:end_idx, :]
239
+ # )
240
+ # test2 = (
241
+ # noise[i, start_idx:end_idx, :] - noisy_feature[i, start_idx:end_idx, :]
242
+ # )
243
+ # # Compute length on last dimension
244
+ # print(test1.norm(dim=-1))
245
+ # print(test2.norm(dim=-1))
246
+
247
+ feature = self.preprocess(feature) # (B, C, T, 1, 1)
248
+ noisy_feature = self.preprocess(noisy_feature) # (B, C, T, 1, 1)
249
+ noise = self.preprocess(noise) # (B, C, T, 1, 1)
250
+
251
+ feature_ref = []
252
+ noise_ref = []
253
+ noisy_feature_input = []
254
+ for i in range(batch_size):
255
+ t = time_steps[i].item()
256
+ end_index = int(self.chunk_size * t) + 1
257
+ valid_len = feature_length[i].item()
258
+ end_index = min(valid_len, end_index)
259
+ feature_ref.append(feature[i, :, :end_index, ...])
260
+ noise_ref.append(noise[i, :, :end_index, ...])
261
+ noisy_feature_input.append(noisy_feature[i, :, :end_index, ...])
262
+
263
+ # Encode text condition (using cache)
264
+ if self.use_text_cond and "text" in x:
265
+ text_list = x["text"] # List[str] or List[List[str]]
266
+ if isinstance(text_list[0], list):
267
+ text_end_list = x["feature_text_end"]
268
+ all_text_context = []
269
+ for single_text_list, single_text_end_list in zip(
270
+ text_list, text_end_list
271
+ ):
272
+ if np.random.rand() > self.drop_out:
273
+ single_text_end_list = [0] + [
274
+ min(t, seq_len) for t in single_text_end_list
275
+ ]
276
+ else:
277
+ single_text_list = [""]
278
+ single_text_end_list = [0, seq_len]
279
+ single_text_length_list = [
280
+ t - b
281
+ for t, b in zip(
282
+ single_text_end_list[1:], single_text_end_list[:-1]
283
+ )
284
+ ]
285
+ single_text_context = self.encode_text_with_cache(
286
+ single_text_list, device
287
+ )
288
+ single_text_context = [
289
+ u.to(self.param_dtype) for u in single_text_context
290
+ ]
291
+ for u, duration in zip(
292
+ single_text_context, single_text_length_list
293
+ ):
294
+ all_text_context.extend([u for _ in range(duration)])
295
+ all_text_context.extend(
296
+ [
297
+ single_text_context[-1]
298
+ for _ in range(seq_len - single_text_end_list[-1])
299
+ ]
300
+ )
301
+ else:
302
+ all_text_context = [
303
+ (u if np.random.rand() > self.drop_out else "") for u in text_list
304
+ ]
305
+ all_text_context = self.encode_text_with_cache(all_text_context, device)
306
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
307
+ else:
308
+ all_text_context = [""] * batch_size
309
+ all_text_context = self.encode_text_with_cache(all_text_context, device)
310
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
311
+
312
+ # Through WanModel
313
+ predicted_result = self.model(
314
+ noisy_feature_input,
315
+ noise_level * self.time_embedding_scale,
316
+ all_text_context,
317
+ seq_len,
318
+ y=None,
319
+ ) # (B, C, T, 1, 1)
320
+
321
+ loss = 0.0
322
+ for b in range(batch_size):
323
+ if self.prediction_type == "vel":
324
+ vel = feature_ref[b] - noise_ref[b] # (C, input_length, 1, 1)
325
+ squared_error = (
326
+ predicted_result[b][:, -self.chunk_size :, ...]
327
+ - vel[:, -self.chunk_size :, ...]
328
+ ) ** 2
329
+ elif self.prediction_type == "x0":
330
+ squared_error = (
331
+ predicted_result[b][:, -self.chunk_size :, ...]
332
+ - feature_ref[b][:, -self.chunk_size :, ...]
333
+ ) ** 2
334
+ elif self.prediction_type == "noise":
335
+ squared_error = (
336
+ predicted_result[b][:, -self.chunk_size :, ...]
337
+ - noise_ref[b][:, -self.chunk_size :, ...]
338
+ ) ** 2
339
+ sample_loss = squared_error.sum().mean()
340
+ loss += sample_loss
341
+ loss = loss / batch_size
342
+
343
+ loss_dict = {"total": loss, "mse": loss}
344
+ return loss_dict
345
+
346
+ def generate(self, x, num_denoise_steps=None):
347
+ """
348
+ Generation - Diffusion Forcing inference
349
+ Uses triangular noise schedule, progressively generating from left to right
350
+
351
+ Generation process:
352
+ 1. Start from t=0, gradually increase t
353
+ 2. Each t corresponds to a noise schedule: clean on left, noisy on right, gradient in middle
354
+ 3. After each denoising step, t increases slightly and continues
355
+ """
356
+ feature_length = x["feature_length"]
357
+ batch_size = len(feature_length)
358
+ seq_len = max(feature_length).item()
359
+
360
+ # # debug
361
+ # x["text"] = [["walk forward.", "sit down.", "stand up."] for _ in range(batch_size)]
362
+ # x["feature_text_end"] = [[1, 2, 3] for _ in range(batch_size)]
363
+ # text = x["text"]
364
+ # text_end = x["feature_text_end"]
365
+ # print(text)
366
+ # print(text_end)
367
+ # print(batch_size, seq_len, self.chunk_size)
368
+
369
+ if num_denoise_steps is None:
370
+ num_denoise_steps = self.noise_steps
371
+ assert num_denoise_steps % self.chunk_size == 0
372
+
373
+ device = next(self.parameters()).device
374
+
375
+ # Initialize entire sequence as pure noise
376
+ generated = torch.randn(
377
+ batch_size, seq_len + self.chunk_size, self.input_dim, device=device
378
+ )
379
+ generated = self.preprocess(generated) # (B, C, T, 1, 1)
380
+
381
+ # Calculate total number of time steps needed
382
+ max_t = 1 + (seq_len - 1) / self.chunk_size
383
+
384
+ # Step size for each advancement
385
+ dt = 1 / num_denoise_steps
386
+ total_steps = int(max_t / dt)
387
+
388
+ # Encode text condition (using cache)
389
+ if self.use_text_cond and "text" in x:
390
+ text_list = x["text"] # List[str] or List[List[str]]
391
+ if isinstance(text_list[0], list):
392
+ generated_length = []
393
+ text_end_list = x["feature_text_end"]
394
+ full_text = []
395
+ all_text_context = []
396
+ for single_text_list, single_text_end_list in zip(
397
+ text_list, text_end_list
398
+ ):
399
+ single_text_end_list = [0] + [
400
+ min(t, seq_len) for t in single_text_end_list
401
+ ]
402
+ generated_length.append(single_text_end_list[-1])
403
+ single_text_length_list = [
404
+ t - b
405
+ for t, b in zip(
406
+ single_text_end_list[1:], single_text_end_list[:-1]
407
+ )
408
+ ]
409
+ full_text.append(
410
+ " ////////// ".join(
411
+ [
412
+ f"{u} //dur:{t}"
413
+ for u, t in zip(
414
+ single_text_list, single_text_length_list
415
+ )
416
+ ]
417
+ )
418
+ )
419
+ single_text_context = self.encode_text_with_cache(
420
+ single_text_list, device
421
+ )
422
+ single_text_context = [
423
+ u.to(self.param_dtype) for u in single_text_context
424
+ ]
425
+ for u, duration in zip(
426
+ single_text_context, single_text_length_list
427
+ ):
428
+ all_text_context.extend([u for _ in range(duration)])
429
+ all_text_context.extend(
430
+ [
431
+ single_text_context[-1]
432
+ for _ in range(
433
+ seq_len + self.chunk_size - single_text_end_list[-1]
434
+ )
435
+ ]
436
+ )
437
+ else:
438
+ generated_length = feature_length
439
+ full_text = text_list
440
+ all_text_context = self.encode_text_with_cache(text_list, device)
441
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
442
+ else:
443
+ generated_length = feature_length
444
+ full_text = [""] * batch_size
445
+ all_text_context = [""] * batch_size
446
+ all_text_context = self.encode_text_with_cache(all_text_context, device)
447
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
448
+
449
+ # Get empty text condition encoding (for CFG)
450
+ text_null_list = [""] * batch_size
451
+ text_null_context = self.encode_text_with_cache(text_null_list, device)
452
+ text_null_context = [u.to(self.param_dtype) for u in text_null_context]
453
+
454
+ # print(len(all_text_context), len(text_null_context))
455
+
456
+ # Progressively advance from t=0 to t=max_t
457
+ for step in range(total_steps):
458
+ # Current time step
459
+ t = step * dt
460
+ start_index = max(0, int(self.chunk_size * (t - 1)) + 1)
461
+ end_index = int(self.chunk_size * t) + 1
462
+ time_steps = torch.full((batch_size,), t, device=device)
463
+
464
+ # Calculate current noise schedule
465
+ noise_level = self._get_noise_levels(
466
+ device, seq_len + self.chunk_size, time_steps
467
+ ) # (B, T)
468
+
469
+ # Predict noise through WanModel
470
+ noisy_input = []
471
+ for i in range(batch_size):
472
+ noisy_input.append(generated[i, :, :end_index, ...])
473
+
474
+ predicted_result = self.model(
475
+ noisy_input,
476
+ noise_level * self.time_embedding_scale,
477
+ all_text_context,
478
+ seq_len + self.chunk_size,
479
+ y=None,
480
+ ) # (B, C, T, 1, 1)
481
+
482
+ # Adjust using CFG
483
+ if self.cfg_scale != 1.0:
484
+ predicted_result_null = self.model(
485
+ noisy_input,
486
+ noise_level * self.time_embedding_scale,
487
+ text_null_context,
488
+ seq_len + self.chunk_size,
489
+ y=None,
490
+ ) # (B, C, T, 1, 1)
491
+ predicted_result = [
492
+ self.cfg_scale * pv - (self.cfg_scale - 1) * pvn
493
+ for pv, pvn in zip(predicted_result, predicted_result_null)
494
+ ]
495
+
496
+ for i in range(batch_size):
497
+ predicted_result_i = predicted_result[i] # (C, input_length, 1, 1)
498
+ if self.prediction_type == "vel":
499
+ predicted_vel = predicted_result_i[:, start_index:end_index, ...]
500
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
501
+ elif self.prediction_type == "x0":
502
+ predicted_vel = (
503
+ predicted_result_i[:, start_index:end_index, ...]
504
+ - generated[i, :, start_index:end_index, ...]
505
+ ) / (
506
+ noise_level[i, start_index:end_index]
507
+ .unsqueeze(0)
508
+ .unsqueeze(-1)
509
+ .unsqueeze(-1)
510
+ )
511
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
512
+ elif self.prediction_type == "noise":
513
+ predicted_vel = (
514
+ generated[i, :, start_index:end_index, ...]
515
+ - predicted_result_i[:, start_index:end_index, ...]
516
+ ) / (
517
+ 1
518
+ + dt
519
+ - noise_level[i, start_index:end_index]
520
+ .unsqueeze(0)
521
+ .unsqueeze(-1)
522
+ .unsqueeze(-1)
523
+ )
524
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
525
+
526
+ generated = self.postprocess(generated) # (B, T, C)
527
+ y_hat_out = []
528
+ for i in range(batch_size):
529
+ # cut off the padding
530
+ single_generated = generated[i, : generated_length[i], :]
531
+ y_hat_out.append(single_generated)
532
+ out = {}
533
+ out["generated"] = y_hat_out
534
+ out["text"] = full_text
535
+
536
+ return out
537
+
538
+ @torch.no_grad()
539
+ def stream_generate(self, x, num_denoise_steps=None):
540
+ """
541
+ Streaming generation - Diffusion Forcing inference
542
+ Uses triangular noise schedule, progressively generating from left to right
543
+
544
+ Generation process:
545
+ 1. Start from t=0, gradually increase t
546
+ 2. Each t corresponds to a noise schedule: clean on left, noisy on right, gradient in middle
547
+ 3. After each denoising step, t increases slightly and continues
548
+ """
549
+ feature_length = x["feature_length"]
550
+ batch_size = len(feature_length)
551
+ seq_len = max(feature_length).item()
552
+
553
+ # # debug
554
+ # x["text"] = [["walk forward.", "sit down.", "stand up."] for _ in range(batch_size)]
555
+ # x["feature_text_end"] = [[1, 2, 3] for _ in range(batch_size)]
556
+ # text = x["text"]
557
+ # text_end = x["feature_text_end"]
558
+ # print(text)
559
+ # print(text_end)
560
+ # print(batch_size, seq_len, self.chunk_size)
561
+
562
+ if num_denoise_steps is None:
563
+ num_denoise_steps = self.noise_steps
564
+ assert num_denoise_steps % self.chunk_size == 0
565
+
566
+ device = next(self.parameters()).device
567
+
568
+ # Initialize entire sequence as pure noise
569
+ generated = torch.randn(
570
+ batch_size, seq_len + self.chunk_size, self.input_dim, device=device
571
+ )
572
+ generated = self.preprocess(generated) # (B, C, T, 1, 1)
573
+
574
+ # Calculate total number of time steps needed
575
+ max_t = 1 + (seq_len - 1) / self.chunk_size
576
+
577
+ # Step size for each advancement
578
+ dt = 1 / num_denoise_steps
579
+ total_steps = int(max_t / dt)
580
+
581
+ # Encode text condition (using cache)
582
+ if self.use_text_cond and "text" in x:
583
+ text_list = x["text"] # List[str] or List[List[str]]
584
+ if isinstance(text_list[0], list):
585
+ generated_length = []
586
+ text_end_list = x["feature_text_end"]
587
+ full_text = []
588
+ all_text_context = []
589
+ for single_text_list, single_text_end_list in zip(
590
+ text_list, text_end_list
591
+ ):
592
+ single_text_end_list = [0] + [
593
+ min(t, seq_len) for t in single_text_end_list
594
+ ]
595
+ generated_length.append(single_text_end_list[-1])
596
+ single_text_length_list = [
597
+ t - b
598
+ for t, b in zip(
599
+ single_text_end_list[1:], single_text_end_list[:-1]
600
+ )
601
+ ]
602
+ full_text.append(
603
+ " ////////// ".join(
604
+ [
605
+ f"{u} //dur:{t}"
606
+ for u, t in zip(
607
+ single_text_list, single_text_length_list
608
+ )
609
+ ]
610
+ )
611
+ )
612
+ single_text_context = self.encode_text_with_cache(
613
+ single_text_list, device
614
+ )
615
+ single_text_context = [
616
+ u.to(self.param_dtype) for u in single_text_context
617
+ ]
618
+ for u, duration in zip(
619
+ single_text_context, single_text_length_list
620
+ ):
621
+ all_text_context.extend([u for _ in range(duration)])
622
+ all_text_context.extend(
623
+ [
624
+ single_text_context[-1]
625
+ for _ in range(
626
+ seq_len + self.chunk_size - single_text_end_list[-1]
627
+ )
628
+ ]
629
+ )
630
+ else:
631
+ generated_length = feature_length
632
+ full_text = text_list
633
+ all_text_context = self.encode_text_with_cache(text_list, device)
634
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
635
+ else:
636
+ generated_length = feature_length
637
+ full_text = [""] * batch_size
638
+ all_text_context = [""] * batch_size
639
+ all_text_context = self.encode_text_with_cache(all_text_context, device)
640
+ all_text_context = [u.to(self.param_dtype) for u in all_text_context]
641
+
642
+ # Get empty text condition encoding (for CFG)
643
+ text_null_list = [""] * batch_size
644
+ text_null_context = self.encode_text_with_cache(text_null_list, device)
645
+ text_null_context = [u.to(self.param_dtype) for u in text_null_context]
646
+
647
+ # print(len(all_text_context), len(text_null_context))
648
+
649
+ commit_index = 0
650
+ # Progressively advance from t=0 to t=max_t
651
+ for step in range(total_steps):
652
+ # Current time step
653
+ t = step * dt
654
+ start_index = max(0, int(self.chunk_size * (t - 1)) + 1)
655
+ end_index = int(self.chunk_size * t) + 1
656
+ time_steps = torch.full((batch_size,), t, device=device)
657
+
658
+ # Calculate current noise schedule
659
+ noise_level = self._get_noise_levels(
660
+ device, seq_len + self.chunk_size, time_steps
661
+ ) # (B, T)
662
+
663
+ # Predict noise through WanModel
664
+ noisy_input = []
665
+ for i in range(batch_size):
666
+ noisy_input.append(generated[i, :, :end_index, ...])
667
+
668
+ predicted_result = self.model(
669
+ noisy_input,
670
+ noise_level * self.time_embedding_scale,
671
+ all_text_context,
672
+ seq_len + self.chunk_size,
673
+ y=None,
674
+ ) # (B, C, T, 1, 1)
675
+
676
+ # Adjust using CFG
677
+ if self.cfg_scale != 1.0:
678
+ predicted_result_null = self.model(
679
+ noisy_input,
680
+ noise_level * self.time_embedding_scale,
681
+ text_null_context,
682
+ seq_len + self.chunk_size,
683
+ y=None,
684
+ ) # (B, C, T, 1, 1)
685
+ predicted_result = [
686
+ self.cfg_scale * pv - (self.cfg_scale - 1) * pvn
687
+ for pv, pvn in zip(predicted_result, predicted_result_null)
688
+ ]
689
+
690
+ for i in range(batch_size):
691
+ predicted_result_i = predicted_result[i] # (C, input_length, 1, 1)
692
+ if self.prediction_type == "vel":
693
+ predicted_vel = predicted_result_i[:, start_index:end_index, ...]
694
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
695
+ elif self.prediction_type == "x0":
696
+ predicted_vel = (
697
+ predicted_result_i[:, start_index:end_index, ...]
698
+ - generated[i, :, start_index:end_index, ...]
699
+ ) / (
700
+ noise_level[i, start_index:end_index]
701
+ .unsqueeze(0)
702
+ .unsqueeze(-1)
703
+ .unsqueeze(-1)
704
+ )
705
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
706
+ elif self.prediction_type == "noise":
707
+ predicted_vel = (
708
+ generated[i, :, start_index:end_index, ...]
709
+ - predicted_result_i[:, start_index:end_index, ...]
710
+ ) / (
711
+ 1
712
+ + dt
713
+ - noise_level[i, start_index:end_index]
714
+ .unsqueeze(0)
715
+ .unsqueeze(-1)
716
+ .unsqueeze(-1)
717
+ )
718
+ generated[i, :, start_index:end_index, ...] += predicted_vel * dt
719
+
720
+ if commit_index < start_index:
721
+ output = generated[:, :, commit_index:start_index, ...]
722
+ output = self.postprocess(output) # (B, T, C)
723
+ y_hat_out = []
724
+ for i in range(batch_size):
725
+ if commit_index < generated_length[i]:
726
+ y_hat_out.append(
727
+ output[i, : generated_length[i] - commit_index, ...]
728
+ )
729
+ else:
730
+ y_hat_out.append(None)
731
+
732
+ out = {}
733
+ out["generated"] = y_hat_out
734
+ yield out
735
+ commit_index = start_index
736
+
737
+ output = generated[:, :, commit_index:, ...]
738
+ output = self.postprocess(output) # (B, T_remain, C)
739
+ y_hat_out = []
740
+ for i in range(batch_size):
741
+ if commit_index < generated_length[i]:
742
+ y_hat_out.append(output[i, : generated_length[i] - commit_index, ...])
743
+ else:
744
+ y_hat_out.append(None)
745
+ out = {}
746
+ out["generated"] = y_hat_out
747
+ yield out
748
+
749
+ def init_generated(self, seq_len, batch_size=1, num_denoise_steps=None):
750
+ self.seq_len = seq_len
751
+ self.batch_size = batch_size
752
+ if num_denoise_steps is None:
753
+ self.num_denoise_steps = self.noise_steps
754
+ else:
755
+ self.num_denoise_steps = num_denoise_steps
756
+ assert self.num_denoise_steps % self.chunk_size == 0
757
+ self.dt = 1 / self.num_denoise_steps
758
+ self.current_step = 0
759
+ self.text_condition_list = [[] for _ in range(self.batch_size)]
760
+ self.generated = torch.randn(
761
+ self.batch_size, self.seq_len * 2 + self.chunk_size, self.input_dim
762
+ )
763
+ self.generated = self.preprocess(self.generated) # (B, C, T, 1, 1)
764
+ self.commit_index = 0
765
+
766
+ @torch.no_grad()
767
+ def stream_generate_step(self, x, first_chunk=True):
768
+ """
769
+ Streaming generation step - Diffusion Forcing inference
770
+ Uses triangular noise schedule, progressively generating from left to right
771
+
772
+ Generation process:
773
+ 1. Start from t=0, gradually increase t
774
+ 2. Each t corresponds to a noise schedule: clean on left, noisy on right, gradient in middle
775
+ 3. After each denoising step, t increases slightly and continues
776
+ """
777
+
778
+ device = next(self.parameters()).device
779
+ if first_chunk:
780
+ self.generated = self.generated.to(device)
781
+
782
+ # Encode text condition (using cache)
783
+ if self.use_text_cond and "text" in x:
784
+ text_list = x["text"] # List[str]
785
+ new_text_context = self.encode_text_with_cache(text_list, device)
786
+ new_text_context = [u.to(self.param_dtype) for u in new_text_context]
787
+ else:
788
+ new_text_context = [""] * self.batch_size
789
+ new_text_context = self.encode_text_with_cache(new_text_context, device)
790
+ new_text_context = [u.to(self.param_dtype) for u in new_text_context]
791
+
792
+ # Get empty text condition encoding (for CFG)
793
+ text_null_list = [""] * self.batch_size
794
+ text_null_context = self.encode_text_with_cache(text_null_list, device)
795
+ text_null_context = [u.to(self.param_dtype) for u in text_null_context]
796
+
797
+ for i in range(self.batch_size):
798
+ if first_chunk:
799
+ self.text_condition_list[i].extend(
800
+ [new_text_context[i]] * self.chunk_size
801
+ )
802
+ else:
803
+ self.text_condition_list[i].extend([new_text_context[i]])
804
+
805
+ end_step = (
806
+ (self.commit_index + self.chunk_size)
807
+ * self.num_denoise_steps
808
+ / self.chunk_size
809
+ )
810
+ while self.current_step < end_step:
811
+ current_time = self.current_step * self.dt
812
+ start_index = max(0, int(self.chunk_size * (current_time - 1)) + 1)
813
+ end_index = int(self.chunk_size * current_time) + 1
814
+ time_steps = torch.full((self.batch_size,), current_time, device=device)
815
+
816
+ noise_level = self._get_noise_levels(device, end_index, time_steps)[
817
+ :, -self.seq_len :
818
+ ] # (B, T)
819
+
820
+ # Predict noise through WanModel
821
+ noisy_input = []
822
+ for i in range(self.batch_size):
823
+ noisy_input.append(
824
+ self.generated[i, :, :end_index, ...][:, -self.seq_len :]
825
+ ) # (C, T, 1, 1)
826
+
827
+ text_condition = []
828
+ for i in range(self.batch_size):
829
+ text_condition.extend(
830
+ self.text_condition_list[i][:end_index][-self.seq_len :]
831
+ ) # (T, D, 4096)
832
+
833
+ # print("////////////////////")
834
+ # print("current step: ", self.current_step)
835
+ # print("chunk size: ", self.chunk_size)
836
+ # print("start_index: ", start_index)
837
+ # print("end_index: ", end_index)
838
+ # print("noisy_input shape: ", noisy_input[0].shape)
839
+ # print("noise_level: ", noise_level[0, start_index:end_index])
840
+ # print("text_condition shape: ", len(text_condition))
841
+ # print("commit_index: ", self.commit_index)
842
+ # print("////////////////////")
843
+
844
+ predicted_result = self.model(
845
+ noisy_input,
846
+ noise_level * self.time_embedding_scale,
847
+ text_condition,
848
+ min(end_index, self.seq_len),
849
+ y=None,
850
+ ) # (B, C, T, 1, 1)
851
+
852
+ # Adjust using CFG
853
+ if self.cfg_scale != 1.0:
854
+ predicted_result_null = self.model(
855
+ noisy_input,
856
+ noise_level * self.time_embedding_scale,
857
+ text_null_context,
858
+ min(end_index, self.seq_len),
859
+ y=None,
860
+ ) # (B, C, T, 1, 1)
861
+ predicted_result = [
862
+ self.cfg_scale * pv - (self.cfg_scale - 1) * pvn
863
+ for pv, pvn in zip(predicted_result, predicted_result_null)
864
+ ]
865
+
866
+ for i in range(self.batch_size):
867
+ predicted_result_i = predicted_result[i] # (C, input_length, 1, 1)
868
+ if end_index > self.seq_len:
869
+ predicted_result_i = torch.cat(
870
+ [
871
+ torch.zeros(
872
+ predicted_result_i.shape[0],
873
+ end_index - self.seq_len,
874
+ predicted_result_i.shape[2],
875
+ predicted_result_i.shape[3],
876
+ device=device,
877
+ ),
878
+ predicted_result_i,
879
+ ],
880
+ dim=1,
881
+ )
882
+ if self.prediction_type == "vel":
883
+ predicted_vel = predicted_result_i[:, start_index:end_index, ...]
884
+ self.generated[i, :, start_index:end_index, ...] += (
885
+ predicted_vel * self.dt
886
+ )
887
+ elif self.prediction_type == "x0":
888
+ predicted_vel = (
889
+ predicted_result_i[:, start_index:end_index, ...]
890
+ - self.generated[i, :, start_index:end_index, ...]
891
+ ) / (
892
+ noise_level[i, start_index:end_index]
893
+ .unsqueeze(0)
894
+ .unsqueeze(-1)
895
+ .unsqueeze(-1)
896
+ )
897
+ self.generated[i, :, start_index:end_index, ...] += (
898
+ predicted_vel * self.dt
899
+ )
900
+ elif self.prediction_type == "noise":
901
+ predicted_vel = (
902
+ self.generated[i, :, start_index:end_index, ...]
903
+ - predicted_result_i[:, start_index:end_index, ...]
904
+ ) / (
905
+ 1
906
+ + self.dt
907
+ - noise_level[i, start_index:end_index]
908
+ .unsqueeze(0)
909
+ .unsqueeze(-1)
910
+ .unsqueeze(-1)
911
+ )
912
+ self.generated[i, :, start_index:end_index, ...] += (
913
+ predicted_vel * self.dt
914
+ )
915
+ self.current_step += 1
916
+ output = self.generated[:, :, self.commit_index : self.commit_index + 1, ...]
917
+ output = self.postprocess(output) # (B, 1, C)
918
+ out = {}
919
+ out["generated"] = output
920
+ self.commit_index += 1
921
+
922
+ if self.commit_index == self.seq_len * 2:
923
+ self.generated = torch.cat(
924
+ [
925
+ self.generated[:, :, self.seq_len :, ...],
926
+ torch.randn(
927
+ self.batch_size,
928
+ self.input_dim,
929
+ self.seq_len,
930
+ 1,
931
+ 1,
932
+ device=device,
933
+ ),
934
+ ],
935
+ dim=2,
936
+ )
937
+ self.current_step -= self.seq_len * self.num_denoise_steps / self.chunk_size
938
+ self.commit_index -= self.seq_len
939
+ for i in range(self.batch_size):
940
+ self.text_condition_list[i] = self.text_condition_list[i][
941
+ self.seq_len :
942
+ ]
943
+ return out
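The streaming entry points above (`init_generated` followed by repeated `stream_generate_step` calls) are driven one committed frame per call. Below is a minimal usage sketch, assuming a loaded FloodDiffusion model instance named `model` that exposes exactly these two methods; the prompt, frame count, and window length are illustrative.

```python
# Minimal driver sketch for the streaming API above (names are illustrative).
import torch

@torch.no_grad()
def stream_motion(model, prompt, num_frames, seq_len=64):
    # Allocate the rolling noise buffer and reset the denoising clock.
    model.init_generated(seq_len=seq_len, batch_size=1)
    frames = []
    for i in range(num_frames):
        out = model.stream_generate_step(
            {"text": [prompt]},      # per-step text condition (may change mid-stream)
            first_chunk=(i == 0),    # the first call seeds a whole chunk of conditions
        )
        frames.append(out["generated"])  # (B, 1, C) committed frame
    return torch.cat(frames, dim=1)      # (B, num_frames, C)
```

Because the text condition is re-read on every call, the prompt can be swapped mid-stream to steer the remainder of the motion.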
ldf_models/tools/attention.py ADDED
@@ -0,0 +1,188 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+
4
+ try:
5
+ import flash_attn_interface
6
+
7
+ FLASH_ATTN_3_AVAILABLE = True
8
+ except ModuleNotFoundError:
9
+ FLASH_ATTN_3_AVAILABLE = False
10
+
11
+ try:
12
+ import flash_attn
13
+
14
+ FLASH_ATTN_2_AVAILABLE = True
15
+ except ModuleNotFoundError:
16
+ FLASH_ATTN_2_AVAILABLE = False
17
+
18
+ import warnings
19
+
20
+ __all__ = [
21
+ "flash_attention",
22
+ "attention",
23
+ ]
24
+
25
+
26
+ def flash_attention(
27
+ q,
28
+ k,
29
+ v,
30
+ q_lens=None,
31
+ k_lens=None,
32
+ dropout_p=0.0,
33
+ softmax_scale=None,
34
+ q_scale=None,
35
+ causal=False,
36
+ window_size=(-1, -1),
37
+ deterministic=False,
38
+ dtype=torch.bfloat16,
39
+ version=None,
40
+ ):
41
+ """
42
+ q: [B, Lq, Nq, C1].
43
+ k: [B, Lk, Nk, C1].
44
+ v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
45
+ q_lens: [B].
46
+ k_lens: [B].
47
+ dropout_p: float. Dropout probability.
48
+ softmax_scale: float. The scaling of QK^T before applying softmax.
49
+ causal: bool. Whether to apply causal attention mask.
50
+ window_size: (left, right). If not (-1, -1), apply sliding window local attention.
51
+ deterministic: bool. If True, slightly slower and uses more memory.
52
+ dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
53
+ """
54
+ half_dtypes = (torch.float16, torch.bfloat16)
55
+ assert dtype in half_dtypes
56
+ assert q.device.type == "cuda" and q.size(-1) <= 256
57
+
58
+ # params
59
+ b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
60
+
61
+ def half(x):
62
+ return x if x.dtype in half_dtypes else x.to(dtype)
63
+
64
+ # preprocess query
65
+ if q_lens is None:
66
+ q = half(q.flatten(0, 1))
67
+ q_lens = torch.tensor([lq] * b, dtype=torch.int32).to(
68
+ device=q.device, non_blocking=True
69
+ )
70
+ else:
71
+ q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
72
+
73
+ # preprocess key, value
74
+ if k_lens is None:
75
+ k = half(k.flatten(0, 1))
76
+ v = half(v.flatten(0, 1))
77
+ k_lens = torch.tensor([lk] * b, dtype=torch.int32).to(
78
+ device=k.device, non_blocking=True
79
+ )
80
+ else:
81
+ k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
82
+ v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
83
+
84
+ q = q.to(v.dtype)
85
+ k = k.to(v.dtype)
86
+
87
+ if q_scale is not None:
88
+ q = q * q_scale
89
+
90
+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
91
+ warnings.warn(
92
+ "Flash attention 3 is not available, use flash attention 2 instead."
93
+ )
94
+
95
+ # apply attention
96
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
97
+ # Note: dropout_p, window_size are not supported in FA3 now.
98
+ x = flash_attn_interface.flash_attn_varlen_func(
99
+ q=q,
100
+ k=k,
101
+ v=v,
102
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
103
+ .cumsum(0, dtype=torch.int32)
104
+ .to(q.device, non_blocking=True),
105
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
106
+ .cumsum(0, dtype=torch.int32)
107
+ .to(q.device, non_blocking=True),
108
+ seqused_q=None,
109
+ seqused_k=None,
110
+ max_seqlen_q=lq,
111
+ max_seqlen_k=lk,
112
+ softmax_scale=softmax_scale,
113
+ causal=causal,
114
+ deterministic=deterministic,
115
+ )[0].unflatten(0, (b, lq))
116
+ else:
117
+ assert FLASH_ATTN_2_AVAILABLE
118
+ x = flash_attn.flash_attn_varlen_func(
119
+ q=q,
120
+ k=k,
121
+ v=v,
122
+ cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens])
123
+ .cumsum(0, dtype=torch.int32)
124
+ .to(q.device, non_blocking=True),
125
+ cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens])
126
+ .cumsum(0, dtype=torch.int32)
127
+ .to(q.device, non_blocking=True),
128
+ max_seqlen_q=lq,
129
+ max_seqlen_k=lk,
130
+ dropout_p=dropout_p,
131
+ softmax_scale=softmax_scale,
132
+ causal=causal,
133
+ window_size=window_size,
134
+ deterministic=deterministic,
135
+ ).unflatten(0, (b, lq))
136
+
137
+ # output
138
+ return x.type(out_dtype)
139
+
140
+
141
+ def attention(
142
+ q,
143
+ k,
144
+ v,
145
+ q_lens=None,
146
+ k_lens=None,
147
+ dropout_p=0.0,
148
+ softmax_scale=None,
149
+ q_scale=None,
150
+ causal=False,
151
+ window_size=(-1, -1),
152
+ deterministic=False,
153
+ dtype=torch.bfloat16,
154
+ fa_version=None,
155
+ ):
156
+ if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
157
+ return flash_attention(
158
+ q=q,
159
+ k=k,
160
+ v=v,
161
+ q_lens=q_lens,
162
+ k_lens=k_lens,
163
+ dropout_p=dropout_p,
164
+ softmax_scale=softmax_scale,
165
+ q_scale=q_scale,
166
+ causal=causal,
167
+ window_size=window_size,
168
+ deterministic=deterministic,
169
+ dtype=dtype,
170
+ version=fa_version,
171
+ )
172
+ else:
173
+ if q_lens is not None or k_lens is not None:
174
+ warnings.warn(
175
+ "Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance."
176
+ )
177
+ attn_mask = None
178
+
179
+ q = q.transpose(1, 2).to(dtype)
180
+ k = k.transpose(1, 2).to(dtype)
181
+ v = v.transpose(1, 2).to(dtype)
182
+
183
+ out = torch.nn.functional.scaled_dot_product_attention(
184
+ q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
185
+ )
186
+
187
+ out = out.transpose(1, 2).contiguous()
188
+ return out
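As a sanity check on the wrapper above, here is a small shape test under stated assumptions: the module is importable as `ldf_models.tools.attention`, and if flash-attn is installed a CUDA device is required, otherwise the call falls through to PyTorch's `scaled_dot_product_attention`.

```python
# Shape check for the attention() wrapper (assumed import path).
import torch
from ldf_models.tools.attention import attention

device = "cuda" if torch.cuda.is_available() else "cpu"
b, lq, lk, heads, head_dim = 2, 16, 24, 8, 64
q = torch.randn(b, lq, heads, head_dim, device=device, dtype=torch.bfloat16)
k = torch.randn(b, lk, heads, head_dim, device=device, dtype=torch.bfloat16)
v = torch.randn(b, lk, heads, head_dim, device=device, dtype=torch.bfloat16)

out = attention(q, k, v, causal=False)
print(out.shape)  # torch.Size([2, 16, 8, 64]) - same layout as q
```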
ldf_models/tools/t5.py ADDED
@@ -0,0 +1,564 @@
1
+ # Modified from transformers.models.t5.modeling_t5
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .tokenizers import HuggingfaceTokenizer
11
+
12
+ __all__ = [
13
+ "T5Model",
14
+ "T5Encoder",
15
+ "T5Decoder",
16
+ "T5EncoderModel",
17
+ ]
18
+
19
+
20
+ def fp16_clamp(x):
21
+ if x.dtype == torch.float16 and torch.isinf(x).any():
22
+ clamp = torch.finfo(x.dtype).max - 1000
23
+ x = torch.clamp(x, min=-clamp, max=clamp)
24
+ return x
25
+
26
+
27
+ def init_weights(m):
28
+ if isinstance(m, T5LayerNorm):
29
+ nn.init.ones_(m.weight)
30
+ elif isinstance(m, T5Model):
31
+ nn.init.normal_(m.token_embedding.weight, std=1.0)
32
+ elif isinstance(m, T5FeedForward):
33
+ nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
34
+ nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
35
+ nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
36
+ elif isinstance(m, T5Attention):
37
+ nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn) ** -0.5)
38
+ nn.init.normal_(m.k.weight, std=m.dim**-0.5)
39
+ nn.init.normal_(m.v.weight, std=m.dim**-0.5)
40
+ nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn) ** -0.5)
41
+ elif isinstance(m, T5RelativeEmbedding):
42
+ nn.init.normal_(
43
+ m.embedding.weight, std=(2 * m.num_buckets * m.num_heads) ** -0.5
44
+ )
45
+
46
+
47
+ class GELU(nn.Module):
48
+ def forward(self, x):
49
+ return (
50
+ 0.5
51
+ * x
52
+ * (
53
+ 1.0
54
+ + torch.tanh(
55
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
56
+ )
57
+ )
58
+ )
59
+
60
+
61
+ class T5LayerNorm(nn.Module):
62
+ def __init__(self, dim, eps=1e-6):
63
+ super(T5LayerNorm, self).__init__()
64
+ self.dim = dim
65
+ self.eps = eps
66
+ self.weight = nn.Parameter(torch.ones(dim))
67
+
68
+ def forward(self, x):
69
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) + self.eps)
70
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
71
+ x = x.type_as(self.weight)
72
+ return self.weight * x
73
+
74
+
75
+ class T5Attention(nn.Module):
76
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
77
+ assert dim_attn % num_heads == 0
78
+ super(T5Attention, self).__init__()
79
+ self.dim = dim
80
+ self.dim_attn = dim_attn
81
+ self.num_heads = num_heads
82
+ self.head_dim = dim_attn // num_heads
83
+
84
+ # layers
85
+ self.q = nn.Linear(dim, dim_attn, bias=False)
86
+ self.k = nn.Linear(dim, dim_attn, bias=False)
87
+ self.v = nn.Linear(dim, dim_attn, bias=False)
88
+ self.o = nn.Linear(dim_attn, dim, bias=False)
89
+ self.dropout = nn.Dropout(dropout)
90
+
91
+ def forward(self, x, context=None, mask=None, pos_bias=None):
92
+ """
93
+ x: [B, L1, C].
94
+ context: [B, L2, C] or None.
95
+ mask: [B, L2] or [B, L1, L2] or None.
96
+ """
97
+ # check inputs
98
+ context = x if context is None else context
99
+ b, n, c = x.size(0), self.num_heads, self.head_dim
100
+
101
+ # compute query, key, value
102
+ q = self.q(x).view(b, -1, n, c)
103
+ k = self.k(context).view(b, -1, n, c)
104
+ v = self.v(context).view(b, -1, n, c)
105
+
106
+ # attention bias
107
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
108
+ if pos_bias is not None:
109
+ attn_bias += pos_bias
110
+ if mask is not None:
111
+ assert mask.ndim in [2, 3]
112
+ mask = mask.view(b, 1, 1, -1) if mask.ndim == 2 else mask.unsqueeze(1)
113
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
114
+
115
+ # compute attention (T5 does not use scaling)
116
+ attn = torch.einsum("binc,bjnc->bnij", q, k) + attn_bias
117
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
118
+ x = torch.einsum("bnij,bjnc->binc", attn, v)
119
+
120
+ # output
121
+ x = x.reshape(b, -1, n * c)
122
+ x = self.o(x)
123
+ x = self.dropout(x)
124
+ return x
125
+
126
+
127
+ class T5FeedForward(nn.Module):
128
+ def __init__(self, dim, dim_ffn, dropout=0.1):
129
+ super(T5FeedForward, self).__init__()
130
+ self.dim = dim
131
+ self.dim_ffn = dim_ffn
132
+
133
+ # layers
134
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
135
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
136
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
137
+ self.dropout = nn.Dropout(dropout)
138
+
139
+ def forward(self, x):
140
+ x = self.fc1(x) * self.gate(x)
141
+ x = self.dropout(x)
142
+ x = self.fc2(x)
143
+ x = self.dropout(x)
144
+ return x
145
+
146
+
147
+ class T5SelfAttention(nn.Module):
148
+ def __init__(
149
+ self,
150
+ dim,
151
+ dim_attn,
152
+ dim_ffn,
153
+ num_heads,
154
+ num_buckets,
155
+ shared_pos=True,
156
+ dropout=0.1,
157
+ ):
158
+ super(T5SelfAttention, self).__init__()
159
+ self.dim = dim
160
+ self.dim_attn = dim_attn
161
+ self.dim_ffn = dim_ffn
162
+ self.num_heads = num_heads
163
+ self.num_buckets = num_buckets
164
+ self.shared_pos = shared_pos
165
+
166
+ # layers
167
+ self.norm1 = T5LayerNorm(dim)
168
+ self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
169
+ self.norm2 = T5LayerNorm(dim)
170
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
171
+ self.pos_embedding = (
172
+ None
173
+ if shared_pos
174
+ else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
175
+ )
176
+
177
+ def forward(self, x, mask=None, pos_bias=None):
178
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
179
+ x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
180
+ x = fp16_clamp(x + self.ffn(self.norm2(x)))
181
+ return x
182
+
183
+
184
+ class T5CrossAttention(nn.Module):
185
+ def __init__(
186
+ self,
187
+ dim,
188
+ dim_attn,
189
+ dim_ffn,
190
+ num_heads,
191
+ num_buckets,
192
+ shared_pos=True,
193
+ dropout=0.1,
194
+ ):
195
+ super(T5CrossAttention, self).__init__()
196
+ self.dim = dim
197
+ self.dim_attn = dim_attn
198
+ self.dim_ffn = dim_ffn
199
+ self.num_heads = num_heads
200
+ self.num_buckets = num_buckets
201
+ self.shared_pos = shared_pos
202
+
203
+ # layers
204
+ self.norm1 = T5LayerNorm(dim)
205
+ self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
206
+ self.norm2 = T5LayerNorm(dim)
207
+ self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
208
+ self.norm3 = T5LayerNorm(dim)
209
+ self.ffn = T5FeedForward(dim, dim_ffn, dropout)
210
+ self.pos_embedding = (
211
+ None
212
+ if shared_pos
213
+ else T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False)
214
+ )
215
+
216
+ def forward(
217
+ self, x, mask=None, encoder_states=None, encoder_mask=None, pos_bias=None
218
+ ):
219
+ e = pos_bias if self.shared_pos else self.pos_embedding(x.size(1), x.size(1))
220
+ x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
221
+ x = fp16_clamp(
222
+ x
223
+ + self.cross_attn(self.norm2(x), context=encoder_states, mask=encoder_mask)
224
+ )
225
+ x = fp16_clamp(x + self.ffn(self.norm3(x)))
226
+ return x
227
+
228
+
229
+ class T5RelativeEmbedding(nn.Module):
230
+ def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
231
+ super(T5RelativeEmbedding, self).__init__()
232
+ self.num_buckets = num_buckets
233
+ self.num_heads = num_heads
234
+ self.bidirectional = bidirectional
235
+ self.max_dist = max_dist
236
+
237
+ # layers
238
+ self.embedding = nn.Embedding(num_buckets, num_heads)
239
+
240
+ def forward(self, lq, lk):
241
+ device = self.embedding.weight.device
242
+ # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
243
+ # torch.arange(lq).unsqueeze(1).to(device)
244
+ rel_pos = torch.arange(lk, device=device).unsqueeze(0) - torch.arange(
245
+ lq, device=device
246
+ ).unsqueeze(1)
247
+ rel_pos = self._relative_position_bucket(rel_pos)
248
+ rel_pos_embeds = self.embedding(rel_pos)
249
+ rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(0) # [1, N, Lq, Lk]
250
+ return rel_pos_embeds.contiguous()
251
+
252
+ def _relative_position_bucket(self, rel_pos):
253
+ # preprocess
254
+ if self.bidirectional:
255
+ num_buckets = self.num_buckets // 2
256
+ rel_buckets = (rel_pos > 0).long() * num_buckets
257
+ rel_pos = torch.abs(rel_pos)
258
+ else:
259
+ num_buckets = self.num_buckets
260
+ rel_buckets = 0
261
+ rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
262
+
263
+ # embeddings for small and large positions
264
+ max_exact = num_buckets // 2
265
+ rel_pos_large = (
266
+ max_exact
267
+ + (
268
+ torch.log(rel_pos.float() / max_exact)
269
+ / math.log(self.max_dist / max_exact)
270
+ * (num_buckets - max_exact)
271
+ ).long()
272
+ )
273
+ rel_pos_large = torch.min(
274
+ rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1)
275
+ )
276
+ rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
277
+ return rel_buckets
278
+
279
+
280
+ class T5Encoder(nn.Module):
281
+ def __init__(
282
+ self,
283
+ vocab,
284
+ dim,
285
+ dim_attn,
286
+ dim_ffn,
287
+ num_heads,
288
+ num_layers,
289
+ num_buckets,
290
+ shared_pos=True,
291
+ dropout=0.1,
292
+ ):
293
+ super(T5Encoder, self).__init__()
294
+ self.dim = dim
295
+ self.dim_attn = dim_attn
296
+ self.dim_ffn = dim_ffn
297
+ self.num_heads = num_heads
298
+ self.num_layers = num_layers
299
+ self.num_buckets = num_buckets
300
+ self.shared_pos = shared_pos
301
+
302
+ # layers
303
+ self.token_embedding = (
304
+ vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
305
+ )
306
+ self.pos_embedding = (
307
+ T5RelativeEmbedding(num_buckets, num_heads, bidirectional=True)
308
+ if shared_pos
309
+ else None
310
+ )
311
+ self.dropout = nn.Dropout(dropout)
312
+ self.blocks = nn.ModuleList(
313
+ [
314
+ T5SelfAttention(
315
+ dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout
316
+ )
317
+ for _ in range(num_layers)
318
+ ]
319
+ )
320
+ self.norm = T5LayerNorm(dim)
321
+
322
+ # initialize weights
323
+ self.apply(init_weights)
324
+
325
+ def forward(self, ids, mask=None):
326
+ x = self.token_embedding(ids)
327
+ x = self.dropout(x)
328
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
329
+ for block in self.blocks:
330
+ x = block(x, mask, pos_bias=e)
331
+ x = self.norm(x)
332
+ x = self.dropout(x)
333
+ return x
334
+
335
+
336
+ class T5Decoder(nn.Module):
337
+ def __init__(
338
+ self,
339
+ vocab,
340
+ dim,
341
+ dim_attn,
342
+ dim_ffn,
343
+ num_heads,
344
+ num_layers,
345
+ num_buckets,
346
+ shared_pos=True,
347
+ dropout=0.1,
348
+ ):
349
+ super(T5Decoder, self).__init__()
350
+ self.dim = dim
351
+ self.dim_attn = dim_attn
352
+ self.dim_ffn = dim_ffn
353
+ self.num_heads = num_heads
354
+ self.num_layers = num_layers
355
+ self.num_buckets = num_buckets
356
+ self.shared_pos = shared_pos
357
+
358
+ # layers
359
+ self.token_embedding = (
360
+ vocab if isinstance(vocab, nn.Embedding) else nn.Embedding(vocab, dim)
361
+ )
362
+ self.pos_embedding = (
363
+ T5RelativeEmbedding(num_buckets, num_heads, bidirectional=False)
364
+ if shared_pos
365
+ else None
366
+ )
367
+ self.dropout = nn.Dropout(dropout)
368
+ self.blocks = nn.ModuleList(
369
+ [
370
+ T5CrossAttention(
371
+ dim, dim_attn, dim_ffn, num_heads, num_buckets, shared_pos, dropout
372
+ )
373
+ for _ in range(num_layers)
374
+ ]
375
+ )
376
+ self.norm = T5LayerNorm(dim)
377
+
378
+ # initialize weights
379
+ self.apply(init_weights)
380
+
381
+ def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
382
+ b, s = ids.size()
383
+
384
+ # causal mask
385
+ if mask is None:
386
+ mask = torch.tril(torch.ones(1, s, s).to(ids.device))
387
+ elif mask.ndim == 2:
388
+ mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))
389
+
390
+ # layers
391
+ x = self.token_embedding(ids)
392
+ x = self.dropout(x)
393
+ e = self.pos_embedding(x.size(1), x.size(1)) if self.shared_pos else None
394
+ for block in self.blocks:
395
+ x = block(x, mask, encoder_states, encoder_mask, pos_bias=e)
396
+ x = self.norm(x)
397
+ x = self.dropout(x)
398
+ return x
399
+
400
+
401
+ class T5Model(nn.Module):
402
+ def __init__(
403
+ self,
404
+ vocab_size,
405
+ dim,
406
+ dim_attn,
407
+ dim_ffn,
408
+ num_heads,
409
+ encoder_layers,
410
+ decoder_layers,
411
+ num_buckets,
412
+ shared_pos=True,
413
+ dropout=0.1,
414
+ ):
415
+ super(T5Model, self).__init__()
416
+ self.vocab_size = vocab_size
417
+ self.dim = dim
418
+ self.dim_attn = dim_attn
419
+ self.dim_ffn = dim_ffn
420
+ self.num_heads = num_heads
421
+ self.encoder_layers = encoder_layers
422
+ self.decoder_layers = decoder_layers
423
+ self.num_buckets = num_buckets
424
+
425
+ # layers
426
+ self.token_embedding = nn.Embedding(vocab_size, dim)
427
+ self.encoder = T5Encoder(
428
+ self.token_embedding,
429
+ dim,
430
+ dim_attn,
431
+ dim_ffn,
432
+ num_heads,
433
+ encoder_layers,
434
+ num_buckets,
435
+ shared_pos,
436
+ dropout,
437
+ )
438
+ self.decoder = T5Decoder(
439
+ self.token_embedding,
440
+ dim,
441
+ dim_attn,
442
+ dim_ffn,
443
+ num_heads,
444
+ decoder_layers,
445
+ num_buckets,
446
+ shared_pos,
447
+ dropout,
448
+ )
449
+ self.head = nn.Linear(dim, vocab_size, bias=False)
450
+
451
+ # initialize weights
452
+ self.apply(init_weights)
453
+
454
+ def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
455
+ x = self.encoder(encoder_ids, encoder_mask)
456
+ x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask)
457
+ x = self.head(x)
458
+ return x
459
+
460
+
461
+ def _t5(
462
+ name,
463
+ encoder_only=False,
464
+ decoder_only=False,
465
+ return_tokenizer=False,
466
+ tokenizer_kwargs={},
467
+ dtype=torch.float32,
468
+ device="cpu",
469
+ **kwargs,
470
+ ):
471
+ # sanity check
472
+ assert not (encoder_only and decoder_only)
473
+
474
+ # params
475
+ if encoder_only:
476
+ model_cls = T5Encoder
477
+ kwargs["vocab"] = kwargs.pop("vocab_size")
478
+ kwargs["num_layers"] = kwargs.pop("encoder_layers")
479
+ _ = kwargs.pop("decoder_layers")
480
+ elif decoder_only:
481
+ model_cls = T5Decoder
482
+ kwargs["vocab"] = kwargs.pop("vocab_size")
483
+ kwargs["num_layers"] = kwargs.pop("decoder_layers")
484
+ _ = kwargs.pop("encoder_layers")
485
+ else:
486
+ model_cls = T5Model
487
+
488
+ # init model
489
+ with torch.device(device):
490
+ model = model_cls(**kwargs)
491
+
492
+ # set device
493
+ model = model.to(dtype=dtype, device=device)
494
+
495
+ # init tokenizer
496
+ if return_tokenizer:
497
+ from .tokenizers import HuggingfaceTokenizer
498
+
499
+ tokenizer = HuggingfaceTokenizer(f"google/{name}", **tokenizer_kwargs)
500
+ return model, tokenizer
501
+ else:
502
+ return model
503
+
504
+
505
+ def umt5_xxl(**kwargs):
506
+ cfg = dict(
507
+ vocab_size=256384,
508
+ dim=4096,
509
+ dim_attn=4096,
510
+ dim_ffn=10240,
511
+ num_heads=64,
512
+ encoder_layers=24,
513
+ decoder_layers=24,
514
+ num_buckets=32,
515
+ shared_pos=False,
516
+ dropout=0.1,
517
+ )
518
+ cfg.update(**kwargs)
519
+ return _t5("umt5-xxl", **cfg)
520
+
521
+
522
+ class T5EncoderModel:
523
+ def __init__(
524
+ self,
525
+ text_len,
526
+ dtype=torch.bfloat16,
527
+ device=torch.cuda.current_device(),
528
+ checkpoint_path=None,
529
+ tokenizer_path=None,
530
+ shard_fn=None,
531
+ ):
532
+ self.text_len = text_len
533
+ self.dtype = dtype
534
+ self.device = device
535
+ self.checkpoint_path = checkpoint_path
536
+ self.tokenizer_path = tokenizer_path
537
+
538
+ # init model
539
+ model = (
540
+ umt5_xxl(
541
+ encoder_only=True, return_tokenizer=False, dtype=dtype, device=device
542
+ )
543
+ .eval()
544
+ .requires_grad_(False)
545
+ )
546
+ logging.info(f"loading {checkpoint_path}")
547
+ model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
548
+ self.model = model
549
+ if shard_fn is not None:
550
+ self.model = shard_fn(self.model, sync_module_states=False)
551
+ else:
552
+ self.model.to(self.device)
553
+ # init tokenizer
554
+ self.tokenizer = HuggingfaceTokenizer(
555
+ name=tokenizer_path, seq_len=text_len, clean="whitespace"
556
+ )
557
+
558
+ def __call__(self, texts, device):
559
+ ids, mask = self.tokenizer(texts, return_mask=True, add_special_tokens=True)
560
+ ids = ids.to(device)
561
+ mask = mask.to(device)
562
+ seq_lens = mask.gt(0).sum(dim=1).long()
563
+ context = self.model(ids, mask)
564
+ return [u[:v] for u, v in zip(context, seq_lens)]
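One possible way to wire up the encoder wrapper above is sketched below; the checkpoint and tokenizer paths are placeholders rather than what this repository necessarily ships, and a CUDA device plus UMT5-XXL encoder weights are assumed.

```python
# Hypothetical text-encoding call (paths are placeholders).
import torch
from ldf_models.tools.t5 import T5EncoderModel

text_encoder = T5EncoderModel(
    text_len=512,
    dtype=torch.bfloat16,
    device=torch.device("cuda"),
    checkpoint_path="path/to/umt5-xxl-enc-bf16.pth",   # placeholder
    tokenizer_path="google/umt5-xxl",                  # placeholder
)
contexts = text_encoder(["a person walks forward and waves"], torch.device("cuda"))
print(contexts[0].shape)  # (num_real_tokens, 4096); one tensor per prompt
```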
ldf_models/tools/tokenizers.py ADDED
@@ -0,0 +1,84 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import html
3
+ import string
4
+
5
+ import ftfy
6
+ import regex as re
7
+ from transformers import AutoTokenizer
8
+
9
+ __all__ = ["HuggingfaceTokenizer"]
10
+
11
+
12
+ def basic_clean(text):
13
+ text = ftfy.fix_text(text)
14
+ text = html.unescape(html.unescape(text))
15
+ return text.strip()
16
+
17
+
18
+ def whitespace_clean(text):
19
+ text = re.sub(r"\s+", " ", text)
20
+ text = text.strip()
21
+ return text
22
+
23
+
24
+ def canonicalize(text, keep_punctuation_exact_string=None):
25
+ text = text.replace("_", " ")
26
+ if keep_punctuation_exact_string:
27
+ text = keep_punctuation_exact_string.join(
28
+ part.translate(str.maketrans("", "", string.punctuation))
29
+ for part in text.split(keep_punctuation_exact_string)
30
+ )
31
+ else:
32
+ text = text.translate(str.maketrans("", "", string.punctuation))
33
+ text = text.lower()
34
+ text = re.sub(r"\s+", " ", text)
35
+ return text.strip()
36
+
37
+
38
+ class HuggingfaceTokenizer:
39
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
40
+ assert clean in (None, "whitespace", "lower", "canonicalize")
41
+ self.name = name
42
+ self.seq_len = seq_len
43
+ self.clean = clean
44
+
45
+ # init tokenizer
46
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
47
+ self.vocab_size = self.tokenizer.vocab_size
48
+
49
+ def __call__(self, sequence, **kwargs):
50
+ return_mask = kwargs.pop("return_mask", False)
51
+
52
+ # arguments
53
+ _kwargs = {"return_tensors": "pt"}
54
+ if self.seq_len is not None:
55
+ _kwargs.update(
56
+ {
57
+ "padding": "max_length",
58
+ "truncation": True,
59
+ "max_length": self.seq_len,
60
+ }
61
+ )
62
+ _kwargs.update(**kwargs)
63
+
64
+ # tokenization
65
+ if isinstance(sequence, str):
66
+ sequence = [sequence]
67
+ if self.clean:
68
+ sequence = [self._clean(u) for u in sequence]
69
+ ids = self.tokenizer(sequence, **_kwargs)
70
+
71
+ # output
72
+ if return_mask:
73
+ return ids.input_ids, ids.attention_mask
74
+ else:
75
+ return ids.input_ids
76
+
77
+ def _clean(self, text):
78
+ if self.clean == "whitespace":
79
+ text = whitespace_clean(basic_clean(text))
80
+ elif self.clean == "lower":
81
+ text = whitespace_clean(basic_clean(text)).lower()
82
+ elif self.clean == "canonicalize":
83
+ text = canonicalize(basic_clean(text))
84
+ return text
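The wrapper above simply forwards to a Hugging Face `AutoTokenizer` with optional text cleaning and fixed-length padding. A quick check, assuming `google/umt5-xxl` as an example tokenizer name:

```python
# Quick tokenizer check ("google/umt5-xxl" is only an example name).
from ldf_models.tools.tokenizers import HuggingfaceTokenizer

tok = HuggingfaceTokenizer(name="google/umt5-xxl", seq_len=512, clean="whitespace")
ids, mask = tok(["A person  jumps &amp; spins."], return_mask=True, add_special_tokens=True)
print(ids.shape, mask.shape)  # both (1, 512): padded/truncated to seq_len
print(int(mask.sum()))        # number of non-padding tokens
```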
ldf_models/tools/wan_model.py ADDED
@@ -0,0 +1,592 @@
1
+ # This module uses modified code from Alibaba Wan Team
2
+ # Original source: https://github.com/Wan-Video/Wan2.2
3
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
4
+ # Modified to support stream mode for cross-attention.
5
+ # Added causal attention for self-attention (1d case)
6
+ # Added context length correction.
7
+
8
+ import math
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+
15
+ from .attention import flash_attention
16
+
17
+
18
+ def sinusoidal_embedding_1d(dim, position):
19
+ # preprocess
20
+ assert dim % 2 == 0
21
+ half = dim // 2
22
+ position = position.type(torch.float64)
23
+
24
+ # calculation
25
+ sinusoid = torch.outer(
26
+ position, torch.pow(10000, -torch.arange(half).to(position).div(half))
27
+ )
28
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
29
+ return x
30
+
31
+
32
+ @torch.amp.autocast("cuda", enabled=False)
33
+ def rope_params(max_seq_len, dim, theta=10000):
34
+ assert dim % 2 == 0
35
+ freqs = torch.outer(
36
+ torch.arange(max_seq_len),
37
+ 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float64).div(dim)),
38
+ )
39
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
40
+ return freqs
41
+
42
+
43
+ @torch.amp.autocast("cuda", enabled=False)
44
+ def rope_apply(x, grid_sizes, freqs):
45
+ n, c = x.size(2), x.size(3) // 2
46
+
47
+ # split freqs
48
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
49
+
50
+ # loop over samples
51
+ output = []
52
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
53
+ seq_len = f * h * w
54
+
55
+ # precompute multipliers
56
+ x_i = torch.view_as_complex(
57
+ x[i, :seq_len].to(torch.float64).reshape(seq_len, n, -1, 2)
58
+ )
59
+ freqs_i = torch.cat(
60
+ [
61
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
62
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
63
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
64
+ ],
65
+ dim=-1,
66
+ ).reshape(seq_len, 1, -1)
67
+
68
+ # apply rotary embedding
69
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
70
+ x_i = torch.cat([x_i, x[i, seq_len:]])
71
+
72
+ # append to collection
73
+ output.append(x_i)
74
+ return torch.stack(output).float()
75
+
76
+
77
+ class WanRMSNorm(nn.Module):
78
+ def __init__(self, dim, eps=1e-5):
79
+ super().__init__()
80
+ self.dim = dim
81
+ self.eps = eps
82
+ self.weight = nn.Parameter(torch.ones(dim))
83
+
84
+ def forward(self, x):
85
+ r"""
86
+ Args:
87
+ x(Tensor): Shape [B, L, C]
88
+ """
89
+ return self._norm(x.float()).type_as(x) * self.weight
90
+
91
+ def _norm(self, x):
92
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
93
+
94
+
95
+ class WanLayerNorm(nn.LayerNorm):
96
+ def __init__(self, dim, eps=1e-6, elementwise_affine=False):
97
+ super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
98
+
99
+ def forward(self, x):
100
+ r"""
101
+ Args:
102
+ x(Tensor): Shape [B, L, C]
103
+ """
104
+ return super().forward(x.float()).type_as(x)
105
+
106
+
107
+ class WanSelfAttention(nn.Module):
108
+ def __init__(
109
+ self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6, causal=False
110
+ ):
111
+ assert dim % num_heads == 0
112
+ super().__init__()
113
+ self.dim = dim
114
+ self.num_heads = num_heads
115
+ self.head_dim = dim // num_heads
116
+ self.window_size = window_size
117
+ self.qk_norm = qk_norm
118
+ self.eps = eps
119
+ self.causal = causal
120
+ # layers
121
+ self.q = nn.Linear(dim, dim)
122
+ self.k = nn.Linear(dim, dim)
123
+ self.v = nn.Linear(dim, dim)
124
+ self.o = nn.Linear(dim, dim)
125
+ self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
126
+ self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
127
+
128
+ def forward(self, x, seq_lens, grid_sizes, freqs):
129
+ r"""
130
+ Args:
131
+ x(Tensor): Shape [B, L, num_heads, C / num_heads]
132
+ seq_lens(Tensor): Shape [B]
133
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
134
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
135
+ """
136
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
137
+
138
+ # query, key, value function
139
+ def qkv_fn(x):
140
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
141
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
142
+ v = self.v(x).view(b, s, n, d)
143
+ return q, k, v
144
+
145
+ q, k, v = qkv_fn(x)
146
+
147
+ x = flash_attention(
148
+ q=rope_apply(q, grid_sizes, freqs),
149
+ k=rope_apply(k, grid_sizes, freqs),
150
+ v=v,
151
+ k_lens=seq_lens,
152
+ window_size=self.window_size,
153
+ causal=self.causal,
154
+ )
155
+
156
+ # output
157
+ x = x.flatten(2)
158
+ x = self.o(x)
159
+ return x
160
+
161
+
162
+ class WanCrossAttention(WanSelfAttention):
163
+ def forward(self, x, context, context_lens):
164
+ r"""
165
+ Args non-stream mode:
166
+ x(Tensor): Shape [B, L1, C]
167
+ context(Tensor): Shape [B, L2, C]
168
+ context_lens(Tensor): Shape [B]
169
+ Args stream mode:
170
+ x(Tensor): Shape [B, L1, C]
171
+ context(Tensor): Shape [BxL1, L2, C]
172
+ context_lens(Tensor): Shape [BxL1]
173
+ """
174
+ out_sizes = x.size()
175
+ b, n, d = context.size(0), self.num_heads, self.head_dim
176
+
177
+ # compute query, key, value
178
+ q = self.norm_q(self.q(x)).view(b, -1, n, d)
179
+ k = self.norm_k(self.k(context)).view(b, -1, n, d)
180
+ v = self.v(context).view(b, -1, n, d)
181
+
182
+ # compute attention
183
+ x = flash_attention(q, k, v, k_lens=context_lens)
184
+
185
+ # output
186
+ x = x.flatten(2).view(*out_sizes)
187
+ x = self.o(x)
188
+ return x
189
+
190
+
191
+ class WanAttentionBlock(nn.Module):
192
+ def __init__(
193
+ self,
194
+ dim,
195
+ ffn_dim,
196
+ num_heads,
197
+ window_size=(-1, -1),
198
+ qk_norm=True,
199
+ cross_attn_norm=False,
200
+ eps=1e-6,
201
+ causal=False,
202
+ ):
203
+ super().__init__()
204
+ self.dim = dim
205
+ self.ffn_dim = ffn_dim
206
+ self.num_heads = num_heads
207
+ self.window_size = window_size
208
+ self.qk_norm = qk_norm
209
+ self.cross_attn_norm = cross_attn_norm
210
+ self.eps = eps
211
+ self.causal = causal
212
+ # layers
213
+ self.norm1 = WanLayerNorm(dim, eps)
214
+ self.self_attn = WanSelfAttention(
215
+ dim, num_heads, window_size, qk_norm, eps, causal
216
+ )
217
+ self.norm3 = (
218
+ WanLayerNorm(dim, eps, elementwise_affine=True)
219
+ if cross_attn_norm
220
+ else nn.Identity()
221
+ )
222
+
223
+ self.cross_attn = WanCrossAttention(dim, num_heads, (-1, -1), qk_norm, eps)
224
+ self.norm2 = WanLayerNorm(dim, eps)
225
+ self.ffn = nn.Sequential(
226
+ nn.Linear(dim, ffn_dim),
227
+ nn.GELU(approximate="tanh"),
228
+ nn.Linear(ffn_dim, dim),
229
+ )
230
+
231
+ # modulation
232
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
233
+
234
+ def forward(
235
+ self,
236
+ x,
237
+ e,
238
+ seq_lens,
239
+ grid_sizes,
240
+ freqs,
241
+ context,
242
+ context_lens,
243
+ ):
244
+ r"""
245
+ Args:
246
+ x(Tensor): Shape [B, L, C]
247
+ e(Tensor): Shape [B, L1, 6, C]
248
+ seq_lens(Tensor): Shape [B], length of each sequence in batch
249
+ grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
250
+ freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
251
+ """
252
+ assert e.dtype == torch.float32
253
+ with torch.amp.autocast("cuda", dtype=torch.float32):
254
+ e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2)
255
+ assert e[0].dtype == torch.float32
256
+
257
+ # self-attention
258
+ y = self.self_attn(
259
+ self.norm1(x).float() * (1 + e[1].squeeze(2)) + e[0].squeeze(2),
260
+ seq_lens,
261
+ grid_sizes,
262
+ freqs,
263
+ )
264
+ with torch.amp.autocast("cuda", dtype=torch.float32):
265
+ x = x + y * e[2].squeeze(2)
266
+
267
+ # cross-attention & ffn function
268
+ def cross_attn_ffn(x, context, context_lens, e):
269
+ x = x + self.cross_attn(self.norm3(x), context, context_lens)
270
+ y = self.ffn(
271
+ self.norm2(x).float() * (1 + e[4].squeeze(2)) + e[3].squeeze(2)
272
+ )
273
+ with torch.amp.autocast("cuda", dtype=torch.float32):
274
+ x = x + y * e[5].squeeze(2)
275
+ return x
276
+
277
+ x = cross_attn_ffn(x, context, context_lens, e)
278
+ return x
279
+
280
+
281
+ class Head(nn.Module):
282
+ def __init__(self, dim, out_dim, patch_size, eps=1e-6):
283
+ super().__init__()
284
+ self.dim = dim
285
+ self.out_dim = out_dim
286
+ self.patch_size = patch_size
287
+ self.eps = eps
288
+
289
+ # layers
290
+ out_dim = math.prod(patch_size) * out_dim
291
+ self.norm = WanLayerNorm(dim, eps)
292
+ self.head = nn.Linear(dim, out_dim)
293
+
294
+ # modulation
295
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
296
+
297
+ def forward(self, x, e):
298
+ r"""
299
+ Args:
300
+ x(Tensor): Shape [B, L1, C]
301
+ e(Tensor): Shape [B, L1, C]
302
+ """
303
+ assert e.dtype == torch.float32
304
+ with torch.amp.autocast("cuda", dtype=torch.float32):
305
+ e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2)
306
+ x = self.head(self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2))
307
+ return x
308
+
309
+
310
+ class WanModel(ModelMixin, ConfigMixin):
311
+ r"""
312
+ Wan diffusion backbone supporting both text-to-video and image-to-video.
313
+ """
314
+
315
+ ignore_for_config = [
316
+ "patch_size",
317
+ "cross_attn_norm",
318
+ "qk_norm",
319
+ "text_dim",
320
+ "window_size",
321
+ ]
322
+ _no_split_modules = ["WanAttentionBlock"]
323
+
324
+ @register_to_config
325
+ def __init__(
326
+ self,
327
+ model_type="t2v",
328
+ patch_size=(1, 2, 2),
329
+ text_len=512,
330
+ in_dim=16,
331
+ dim=2048,
332
+ ffn_dim=8192,
333
+ freq_dim=256,
334
+ text_dim=4096,
335
+ out_dim=16,
336
+ num_heads=16,
337
+ num_layers=32,
338
+ window_size=(-1, -1),
339
+ qk_norm=True,
340
+ cross_attn_norm=True,
341
+ eps=1e-6,
342
+ causal=False,
343
+ ):
344
+ r"""
345
+ Initialize the diffusion model backbone.
346
+
347
+ Args:
348
+ model_type (`str`, *optional*, defaults to 't2v'):
349
+ Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
350
+ patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
351
+ 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
352
+ text_len (`int`, *optional*, defaults to 512):
353
+ Fixed length for text embeddings
354
+ in_dim (`int`, *optional*, defaults to 16):
355
+ Input video channels (C_in)
356
+ dim (`int`, *optional*, defaults to 2048):
357
+ Hidden dimension of the transformer
358
+ ffn_dim (`int`, *optional*, defaults to 8192):
359
+ Intermediate dimension in feed-forward network
360
+ freq_dim (`int`, *optional*, defaults to 256):
361
+ Dimension for sinusoidal time embeddings
362
+ text_dim (`int`, *optional*, defaults to 4096):
363
+ Input dimension for text embeddings
364
+ out_dim (`int`, *optional*, defaults to 16):
365
+ Output video channels (C_out)
366
+ num_heads (`int`, *optional*, defaults to 16):
367
+ Number of attention heads
368
+ num_layers (`int`, *optional*, defaults to 32):
369
+ Number of transformer blocks
370
+ window_size (`tuple`, *optional*, defaults to (-1, -1)):
371
+ Window size for local attention (-1 indicates global attention)
372
+ qk_norm (`bool`, *optional*, defaults to True):
373
+ Enable query/key normalization
374
+ cross_attn_norm (`bool`, *optional*, defaults to True):
375
+ Enable cross-attention normalization
376
+ eps (`float`, *optional*, defaults to 1e-6):
377
+ Epsilon value for normalization layers
378
+ """
379
+
380
+ super().__init__()
381
+
382
+ assert model_type in ["t2v", "i2v", "ti2v", "s2v"]
383
+ self.model_type = model_type
384
+
385
+ self.patch_size = patch_size
386
+ self.text_len = text_len
387
+ self.in_dim = in_dim
388
+ self.dim = dim
389
+ self.ffn_dim = ffn_dim
390
+ self.freq_dim = freq_dim
391
+ self.text_dim = text_dim
392
+ self.out_dim = out_dim
393
+ self.num_heads = num_heads
394
+ self.num_layers = num_layers
395
+ self.window_size = window_size
396
+ self.qk_norm = qk_norm
397
+ self.cross_attn_norm = cross_attn_norm
398
+ self.eps = eps
399
+ self.causal = causal
400
+ # embeddings
401
+ self.patch_embedding = nn.Conv3d(
402
+ in_dim, dim, kernel_size=patch_size, stride=patch_size
403
+ )
404
+ self.text_embedding = nn.Sequential(
405
+ nn.Linear(text_dim, dim), nn.GELU(approximate="tanh"), nn.Linear(dim, dim)
406
+ )
407
+
408
+ self.time_embedding = nn.Sequential(
409
+ nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
410
+ )
411
+ self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
412
+
413
+ # blocks
414
+ self.blocks = nn.ModuleList(
415
+ [
416
+ WanAttentionBlock(
417
+ dim,
418
+ ffn_dim,
419
+ num_heads,
420
+ window_size,
421
+ qk_norm,
422
+ cross_attn_norm,
423
+ eps,
424
+ causal,
425
+ )
426
+ for _ in range(num_layers)
427
+ ]
428
+ )
429
+
430
+ # head
431
+ self.head = Head(dim, out_dim, patch_size, eps)
432
+
433
+ # buffers (don't use register_buffer otherwise dtype will be changed in to())
434
+ assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
435
+ d = dim // num_heads
436
+ self.freqs = torch.cat(
437
+ [
438
+ rope_params(1024, d - 4 * (d // 6)),
439
+ rope_params(1024, 2 * (d // 6)),
440
+ rope_params(1024, 2 * (d // 6)),
441
+ ],
442
+ dim=1,
443
+ )
444
+
445
+ # initialize weights
446
+ self.init_weights()
447
+
448
+ def forward(
449
+ self,
450
+ x,
451
+ t,
452
+ context,
453
+ seq_len,
454
+ y=None,
455
+ ):
456
+ r"""
457
+ Forward pass through the diffusion model
458
+
459
+ Args:
460
+ x (List[Tensor]):
461
+ List of input video tensors, each with shape [C_in, F, H, W]
462
+ t (Tensor):
463
+ Diffusion timesteps tensor of shape [B]
464
+ context (List[Tensor]):
465
+ List of text embeddings each with shape [L, C]
466
+ seq_len (`int`):
467
+ Maximum sequence length for positional encoding
468
+ y (List[Tensor], *optional*):
469
+ Conditional video inputs for image-to-video mode, same shape as x
470
+
471
+ Returns:
472
+ List[Tensor]:
473
+ List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
474
+ """
475
+ if self.model_type == "i2v":
476
+ assert y is not None
477
+ # params
478
+ device = self.patch_embedding.weight.device
479
+ if self.freqs.device != device:
480
+ self.freqs = self.freqs.to(device)
481
+
482
+ if y is not None:
483
+ x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
484
+
485
+ # embeddings
486
+ x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
487
+ grid_sizes = torch.stack(
488
+ [torch.tensor(u.shape[2:], dtype=torch.long) for u in x]
489
+ )
490
+ x = [u.flatten(2).transpose(1, 2) for u in x]
491
+ seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
492
+ assert seq_lens.max() <= seq_len
493
+ x = torch.cat(
494
+ [
495
+ torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
496
+ for u in x
497
+ ]
498
+ )
499
+
500
+ # time embeddings
501
+ if t.dim() == 1: # bs
502
+ t = t.expand(t.size(0), seq_len)
503
+ with torch.amp.autocast("cuda", dtype=torch.float32):
504
+ bt = t.size(0)
505
+ t = t.flatten()
506
+ e = self.time_embedding(
507
+ sinusoidal_embedding_1d(self.freq_dim, t)
508
+ .unflatten(0, (bt, seq_len))
509
+ .float()
510
+ )
511
+ e0 = self.time_projection(e).unflatten(2, (6, self.dim))
512
+ assert e.dtype == torch.float32 and e0.dtype == torch.float32
513
+
514
+ # context
515
+ context_lens = torch.tensor([u.size(0) for u in context], dtype=torch.long)
516
+ context = self.text_embedding(
517
+ torch.stack(
518
+ [
519
+ torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
520
+ for u in context
521
+ ]
522
+ )
523
+ )
524
+
525
+ # arguments
526
+ kwargs = dict(
527
+ e=e0,
528
+ seq_lens=seq_lens,
529
+ grid_sizes=grid_sizes,
530
+ freqs=self.freqs,
531
+ context=context,
532
+ context_lens=context_lens,
533
+ )
534
+
535
+ for block in self.blocks:
536
+ x = block(x, **kwargs)
537
+
538
+ # head
539
+ x = self.head(x, e)
540
+
541
+ # unpatchify
542
+ x = self.unpatchify(x, grid_sizes)
543
+ return [u.float() for u in x]
544
+
545
+ def unpatchify(self, x, grid_sizes):
546
+ r"""
547
+ Reconstruct video tensors from patch embeddings.
548
+
549
+ Args:
550
+ x (List[Tensor]):
551
+ List of patchified features, each with shape [L, C_out * prod(patch_size)]
552
+ grid_sizes (Tensor):
553
+ Original spatial-temporal grid dimensions before patching,
554
+ shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
555
+
556
+ Returns:
557
+ List[Tensor]:
558
+ Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
559
+ """
560
+
561
+ c = self.out_dim
562
+ out = []
563
+ for u, v in zip(x, grid_sizes.tolist()):
564
+ u = u[: math.prod(v)].view(*v, *self.patch_size, c)
565
+ u = torch.einsum("fhwpqrc->cfphqwr", u)
566
+ u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
567
+ out.append(u)
568
+ return out
569
+
570
+ def init_weights(self):
571
+ r"""
572
+ Initialize model parameters using Xavier initialization.
573
+ """
574
+
575
+ # basic init
576
+ for m in self.modules():
577
+ if isinstance(m, nn.Linear):
578
+ nn.init.xavier_uniform_(m.weight)
579
+ if m.bias is not None:
580
+ nn.init.zeros_(m.bias)
581
+
582
+ # init embeddings
583
+ nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
584
+ for m in self.text_embedding.modules():
585
+ if isinstance(m, nn.Linear):
586
+ nn.init.normal_(m.weight, std=0.02)
587
+ for m in self.time_embedding.modules():
588
+ if isinstance(m, nn.Linear):
589
+ nn.init.normal_(m.weight, std=0.02)
590
+
591
+ # init output layer
592
+ nn.init.zeros_(self.head.head.weight)
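For reference, here is an illustrative tiny configuration of this backbone for 1-D motion latents, where the spatial grid is 1x1 as in the streaming code earlier in this commit. The hyper-parameters are made up for the example (the real ones ship with the checkpoint), and running `forward` additionally requires a CUDA device with flash-attn installed because `WanSelfAttention` calls `flash_attention` directly.

```python
# Illustrative tiny instantiation (hyper-parameters are made up for the example).
from ldf_models.tools.wan_model import WanModel

model = WanModel(
    model_type="t2v",
    patch_size=(1, 1, 1),   # no patching: one token per motion frame
    in_dim=64, out_dim=64,  # latent channels of the 1-D motion VAE
    dim=256, ffn_dim=1024,
    num_heads=8, num_layers=4,
    text_dim=4096,          # width of the UMT5 text features
    causal=True,            # causal self-attention along time
)
# forward() expects a list of [C_in, F, 1, 1] tensors, per-frame timesteps,
# and a list of [L, text_dim] context tensors, and returns denoised latents.
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")
```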
ldf_models/tools/wan_vae_1d.py ADDED
@@ -0,0 +1,762 @@
1
+ # This module uses modified code from Alibaba Wan Team
2
+ # Original source: https://github.com/Wan-Video/Wan2.2
3
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
4
+ # Modified to support 1d features with (B, C, T)
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ CACHE_T = 2
11
+
12
+
13
+ class CausalConv1d(nn.Conv1d):
14
+ """
15
+ Causal 1d convolution.
16
+ """
17
+
18
+ def __init__(self, *args, **kwargs):
19
+ super().__init__(*args, **kwargs)
20
+ self._padding = (
21
+ 2 * self.padding[0],
22
+ 0,
23
+ )
24
+ self.padding = (0,)
25
+
26
+ def forward(self, x, cache_x=None):
27
+ padding = list(self._padding)
28
+ if cache_x is not None and self._padding[0] > 0:
29
+ cache_x = cache_x.to(x.device)
30
+ x = torch.cat([cache_x, x], dim=2)
31
+ padding[0] -= cache_x.shape[2]
32
+ x = F.pad(x, padding)
33
+
34
+ return super().forward(x)
35
+
36
+
37
+ class RMS_norm(nn.Module):
38
+ def __init__(self, dim, channel_first=True, bias=False):
39
+ super().__init__()
40
+ broadcastable_dims = (1,)
41
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
42
+
43
+ self.channel_first = channel_first
44
+ self.scale = dim**0.5
45
+ self.gamma = nn.Parameter(torch.ones(shape))
46
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
47
+
48
+ def forward(self, x):
49
+ return (
50
+ F.normalize(x, dim=(1 if self.channel_first else -1))
51
+ * self.scale
52
+ * self.gamma
53
+ + self.bias
54
+ )
55
+
56
+
57
+ class Upsample(nn.Upsample):
58
+ def forward(self, x):
59
+ """
60
+ Fix bfloat16 support for nearest neighbor interpolation.
61
+ """
62
+ return super().forward(x.float()).type_as(x)
63
+
64
+
65
+ class Resample(nn.Module):
66
+ def __init__(self, dim, mode):
67
+ assert mode in (
68
+ "upsample1d",
69
+ "downsample1d",
70
+ )
71
+ super().__init__()
72
+ self.dim = dim
73
+ self.mode = mode
74
+
75
+ # layers
76
+ if mode == "upsample1d":
77
+ self.time_conv = CausalConv1d(dim, dim * 2, (3,), padding=(1,))
78
+ elif mode == "downsample1d":
79
+ self.time_conv = CausalConv1d(dim, dim, (3,), stride=(2,), padding=(0,))
80
+
81
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
82
+ b, c, t = x.size()
83
+ if self.mode == "upsample1d":
84
+ if feat_cache is not None:
85
+ idx = feat_idx[0]
86
+ if feat_cache[idx] is None:
87
+ feat_cache[idx] = "Rep"
88
+ feat_idx[0] += 1
89
+ else:
90
+ cache_x = x[:, :, -CACHE_T:].clone()
91
+ if (
92
+ cache_x.shape[2] < 2
93
+ and feat_cache[idx] is not None
94
+ and feat_cache[idx] != "Rep"
95
+ ):
96
+ # cache last frame of last two chunk
97
+ cache_x = torch.cat(
98
+ [
99
+ feat_cache[idx][:, :, -1]
100
+ .unsqueeze(2)
101
+ .to(cache_x.device),
102
+ cache_x,
103
+ ],
104
+ dim=2,
105
+ )
106
+ if (
107
+ cache_x.shape[2] < 2
108
+ and feat_cache[idx] is not None
109
+ and feat_cache[idx] == "Rep"
110
+ ):
111
+ cache_x = torch.cat(
112
+ [torch.zeros_like(cache_x).to(cache_x.device), cache_x],
113
+ dim=2,
114
+ )
115
+ if feat_cache[idx] == "Rep":
116
+ x = self.time_conv(x)
117
+ else:
118
+ x = self.time_conv(x, feat_cache[idx])
119
+ feat_cache[idx] = cache_x
120
+ feat_idx[0] += 1
121
+ x = x.reshape(b, 2, c, t)
122
+ x = torch.stack((x[:, 0, :, :], x[:, 1, :, :]), 3)
123
+ x = x.reshape(b, c, t * 2)
124
+
125
+ if self.mode == "downsample1d":
126
+ if feat_cache is not None:
127
+ idx = feat_idx[0]
128
+ if feat_cache[idx] is None:
129
+ feat_cache[idx] = x.clone()
130
+ feat_idx[0] += 1
131
+ else:
132
+ cache_x = x[:, :, -1:].clone()
133
+ x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:], x], 2))
134
+ feat_cache[idx] = cache_x
135
+ feat_idx[0] += 1
136
+ return x
137
+
138
+
139
+ class ResidualBlock(nn.Module):
140
+ def __init__(self, in_dim, out_dim, dropout=0.0):
141
+ super().__init__()
142
+ self.in_dim = in_dim
143
+ self.out_dim = out_dim
144
+
145
+ # layers
146
+ self.residual = nn.Sequential(
147
+ RMS_norm(in_dim),
148
+ nn.SiLU(),
149
+ CausalConv1d(in_dim, out_dim, 3, padding=1),
150
+ RMS_norm(out_dim),
151
+ nn.SiLU(),
152
+ nn.Dropout(dropout),
153
+ CausalConv1d(out_dim, out_dim, 3, padding=1),
154
+ )
155
+ self.shortcut = (
156
+ CausalConv1d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
157
+ )
158
+
159
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
160
+ h = self.shortcut(x)
161
+ for layer in self.residual:
162
+ if isinstance(layer, CausalConv1d) and feat_cache is not None:
163
+ idx = feat_idx[0]
164
+ cache_x = x[:, :, -CACHE_T:].clone()
165
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
166
+ # cache last frame of last two chunk
167
+ cache_x = torch.cat(
168
+ [
169
+ feat_cache[idx][:, :, -1].unsqueeze(2).to(cache_x.device),
170
+ cache_x,
171
+ ],
172
+ dim=2,
173
+ )
174
+ x = layer(x, feat_cache[idx])
175
+ feat_cache[idx] = cache_x
176
+ feat_idx[0] += 1
177
+ else:
178
+ x = layer(x)
179
+ return x + h
180
+
181
+
182
+ class AvgDown1D(nn.Module):
183
+ def __init__(
184
+ self,
185
+ in_channels,
186
+ out_channels,
187
+ factor_t,
188
+ ):
189
+ super().__init__()
190
+ self.in_channels = in_channels
191
+ self.out_channels = out_channels
192
+ self.factor_t = factor_t
193
+ self.factor = self.factor_t
194
+
195
+ assert in_channels * self.factor % out_channels == 0
196
+ self.group_size = in_channels * self.factor // out_channels
197
+
198
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
199
+ pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
200
+ pad = (pad_t, 0)
201
+ x = F.pad(x, pad)
202
+ B, C, T = x.shape
203
+ x = x.view(
204
+ B,
205
+ C,
206
+ T // self.factor_t,
207
+ self.factor_t,
208
+ )
209
+ x = x.permute(0, 1, 3, 2).contiguous()
210
+ x = x.view(
211
+ B,
212
+ C * self.factor,
213
+ T // self.factor_t,
214
+ )
215
+ x = x.view(
216
+ B,
217
+ self.out_channels,
218
+ self.group_size,
219
+ T // self.factor_t,
220
+ )
221
+ x = x.mean(dim=2)
222
+ return x
223
+
224
+
225
+ class DupUp1D(nn.Module):
226
+ def __init__(
227
+ self,
228
+ in_channels: int,
229
+ out_channels: int,
230
+ factor_t,
231
+ ):
232
+ super().__init__()
233
+ self.in_channels = in_channels
234
+ self.out_channels = out_channels
235
+
236
+ self.factor_t = factor_t
237
+ self.factor = self.factor_t
238
+
239
+ assert out_channels * self.factor % in_channels == 0
240
+ self.repeats = out_channels * self.factor // in_channels
241
+
242
+ def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
243
+ x = x.repeat_interleave(self.repeats, dim=1)
244
+ x = x.view(
245
+ x.size(0),
246
+ self.out_channels,
247
+ self.factor_t,
248
+ x.size(2),
249
+ )
250
+ x = x.permute(0, 1, 3, 2).contiguous()
251
+ x = x.view(
252
+ x.size(0),
253
+ self.out_channels,
254
+ x.size(2) * self.factor_t,
255
+ )
256
+ if first_chunk:
257
+ x = x[
258
+ :,
259
+ :,
260
+ self.factor_t - 1 :,
261
+ ]
262
+ return x
263
+
264
+
265
+ class Down_ResidualBlock(nn.Module):
266
+ def __init__(self, in_dim, out_dim, dropout, mult, temperal_downsample=False):
267
+ super().__init__()
268
+
269
+ # Shortcut path with downsample
270
+ if temperal_downsample:
271
+ self.avg_shortcut = AvgDown1D(
272
+ in_dim,
273
+ out_dim,
274
+ factor_t=2,
275
+ )
276
+ else:
277
+ self.avg_shortcut = None
278
+
279
+ # Main path with residual blocks and downsample
280
+ downsamples = []
281
+ for _ in range(mult):
282
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
283
+ in_dim = out_dim
284
+
285
+ # Add the final downsample block
286
+ if temperal_downsample:
287
+ downsamples.append(Resample(out_dim, mode="downsample1d"))
288
+
289
+ self.downsamples = nn.Sequential(*downsamples)
290
+
291
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
292
+ x_copy = x.clone()
293
+ for module in self.downsamples:
294
+ x = module(x, feat_cache, feat_idx)
295
+ if self.avg_shortcut is None:
296
+ return x
297
+ else:
298
+ return x + self.avg_shortcut(x_copy)
299
+
300
+
301
+ class Up_ResidualBlock(nn.Module):
302
+ def __init__(self, in_dim, out_dim, dropout, mult, temperal_upsample=False):
303
+ super().__init__()
304
+ # Shortcut path with upsample
305
+ if temperal_upsample:
306
+ self.avg_shortcut = DupUp1D(
307
+ in_dim,
308
+ out_dim,
309
+ factor_t=2,
310
+ )
311
+ else:
312
+ self.avg_shortcut = None
313
+
314
+ # Main path with residual blocks and upsample
315
+ upsamples = []
316
+ for _ in range(mult):
317
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
318
+ in_dim = out_dim
319
+
320
+ # Add the final upsample block
321
+ if temperal_upsample:
322
+ upsamples.append(Resample(out_dim, mode="upsample1d"))
323
+
324
+ self.upsamples = nn.Sequential(*upsamples)
325
+
326
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
327
+ x_main = x.clone()
328
+ for module in self.upsamples:
329
+ x_main = module(x_main, feat_cache, feat_idx)
330
+ if self.avg_shortcut is not None:
331
+ x_shortcut = self.avg_shortcut(x, first_chunk)
332
+ return x_main + x_shortcut
333
+ else:
334
+ return x_main
335
+
336
+
337
+ class Encoder1d(nn.Module):
338
+ def __init__(
339
+ self,
340
+ input_dim,
341
+ dim=128,
342
+ z_dim=4,
343
+ dim_mult=[1, 2, 4, 4],
344
+ num_res_blocks=2,
345
+ temperal_downsample=[True, True, False],
346
+ dropout=0.0,
347
+ ):
348
+ super().__init__()
349
+ self.dim = dim
350
+ self.z_dim = z_dim
351
+ self.dim_mult = dim_mult
352
+ self.num_res_blocks = num_res_blocks
353
+ self.temperal_downsample = temperal_downsample
354
+
355
+ # dimensions
356
+ dims = [dim * u for u in [1] + dim_mult]
357
+ scale = 1.0
358
+
359
+ # init block
360
+ self.conv1 = CausalConv1d(input_dim, dims[0], 3, padding=1)
361
+
362
+ # downsample blocks
363
+ downsamples = []
364
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
365
+ t_down_flag = (
366
+ temperal_downsample[i] if i < len(temperal_downsample) else False
367
+ )
368
+ downsamples.append(
369
+ Down_ResidualBlock(
370
+ in_dim=in_dim,
371
+ out_dim=out_dim,
372
+ dropout=dropout,
373
+ mult=num_res_blocks,
374
+ temperal_downsample=t_down_flag,
375
+ )
376
+ )
377
+ scale /= 2.0
378
+ self.downsamples = nn.Sequential(*downsamples)
379
+
380
+ # middle blocks
381
+ self.middle = nn.Sequential(
382
+ ResidualBlock(out_dim, out_dim, dropout),
383
+ RMS_norm(out_dim),
384
+ CausalConv1d(out_dim, out_dim, 1),
385
+ ResidualBlock(out_dim, out_dim, dropout),
386
+ )
387
+
388
+ # output blocks
389
+ self.head = nn.Sequential(
390
+ RMS_norm(out_dim),
391
+ nn.SiLU(),
392
+ CausalConv1d(out_dim, z_dim, 3, padding=1),
393
+ )
394
+
395
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
396
+ if feat_cache is not None:
397
+ idx = feat_idx[0]
398
+ cache_x = x[:, :, -CACHE_T:].clone()
399
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
400
+ cache_x = torch.cat(
401
+ [
402
+ feat_cache[idx][:, :, -1].unsqueeze(2).to(cache_x.device),
403
+ cache_x,
404
+ ],
405
+ dim=2,
406
+ )
407
+ x = self.conv1(x, feat_cache[idx])
408
+ feat_cache[idx] = cache_x
409
+ feat_idx[0] += 1
410
+ else:
411
+ x = self.conv1(x)
412
+
413
+ ## downsamples
414
+ for layer in self.downsamples:
415
+ if feat_cache is not None:
416
+ x = layer(x, feat_cache, feat_idx)
417
+ else:
418
+ x = layer(x)
419
+
420
+ ## middle
421
+ for layer in self.middle:
422
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
423
+ x = layer(x, feat_cache, feat_idx)
424
+ else:
425
+ x = layer(x)
426
+
427
+ ## head
428
+ for layer in self.head:
429
+ if isinstance(layer, CausalConv1d) and feat_cache is not None:
430
+ idx = feat_idx[0]
431
+ cache_x = x[:, :, -CACHE_T:].clone()
432
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
433
+ cache_x = torch.cat(
434
+ [
435
+ feat_cache[idx][:, :, -1].unsqueeze(2).to(cache_x.device),
436
+ cache_x,
437
+ ],
438
+ dim=2,
439
+ )
440
+ x = layer(x, feat_cache[idx])
441
+ feat_cache[idx] = cache_x
442
+ feat_idx[0] += 1
443
+ else:
444
+ x = layer(x)
445
+
446
+ return x
447
+
448
+
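With the default arguments above, the encoder's stage widths follow dims = [dim * u for u in [1] + dim_mult] and only the first len(temperal_downsample) stages may halve the time axis, so the default configuration compresses time by a factor of 4. A worked check of that bookkeeping (pure arithmetic, no model instantiation):

dim, dim_mult = 128, [1, 2, 4, 4]
dims = [dim * u for u in [1] + dim_mult]
print(dims)                                  # [128, 128, 256, 512, 512]
print(list(zip(dims[:-1], dims[1:])))        # the four Down_ResidualBlock stages
temperal_downsample = [True, True, False]
print(2 ** sum(temperal_downsample))         # 4: overall temporal compression
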
449
+ class Decoder1d(nn.Module):
450
+ def __init__(
451
+ self,
452
+ output_dim,
453
+ dim=128,
454
+ z_dim=4,
455
+ dim_mult=[1, 2, 4, 4],
456
+ num_res_blocks=2,
457
+ temperal_upsample=[False, True, True],
458
+ dropout=0.0,
459
+ ):
460
+ super().__init__()
461
+ self.dim = dim
462
+ self.z_dim = z_dim
463
+ self.dim_mult = dim_mult
464
+ self.num_res_blocks = num_res_blocks
465
+ self.temperal_upsample = temperal_upsample
466
+
467
+ # dimensions
468
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
469
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
470
+ # init block
471
+ self.conv1 = CausalConv1d(z_dim, dims[0], 3, padding=1)
472
+
473
+ # middle blocks
474
+ self.middle = nn.Sequential(
475
+ ResidualBlock(dims[0], dims[0], dropout),
476
+ RMS_norm(dims[0]),
477
+ CausalConv1d(dims[0], dims[0], 1),
478
+ ResidualBlock(dims[0], dims[0], dropout),
479
+ )
480
+
481
+ # upsample blocks
482
+ upsamples = []
483
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
484
+ t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
485
+ upsamples.append(
486
+ Up_ResidualBlock(
487
+ in_dim=in_dim,
488
+ out_dim=out_dim,
489
+ dropout=dropout,
490
+ mult=num_res_blocks + 1,
491
+ temperal_upsample=t_up_flag,
492
+ )
493
+ )
494
+ self.upsamples = nn.Sequential(*upsamples)
495
+
496
+ # output blocks
497
+ self.head = nn.Sequential(
498
+ RMS_norm(out_dim),
499
+ nn.SiLU(),
500
+ CausalConv1d(out_dim, output_dim, 3, padding=1),
501
+ )
502
+
503
+ def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
504
+ if feat_cache is not None:
505
+ idx = feat_idx[0]
506
+ cache_x = x[:, :, -CACHE_T:].clone()
507
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
508
+ cache_x = torch.cat(
509
+ [
510
+ feat_cache[idx][:, :, -1].unsqueeze(2).to(cache_x.device),
511
+ cache_x,
512
+ ],
513
+ dim=2,
514
+ )
515
+ x = self.conv1(x, feat_cache[idx])
516
+ feat_cache[idx] = cache_x
517
+ feat_idx[0] += 1
518
+ else:
519
+ x = self.conv1(x)
520
+
521
+ for layer in self.middle:
522
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
523
+ x = layer(x, feat_cache, feat_idx)
524
+ else:
525
+ x = layer(x)
526
+
527
+ ## upsamples
528
+ for layer in self.upsamples:
529
+ if feat_cache is not None:
530
+ x = layer(x, feat_cache, feat_idx, first_chunk)
531
+ else:
532
+ x = layer(x)
533
+
534
+ ## head
535
+ for layer in self.head:
536
+ if isinstance(layer, CausalConv1d) and feat_cache is not None:
537
+ idx = feat_idx[0]
538
+ cache_x = x[:, :, -CACHE_T:].clone()
539
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
540
+ cache_x = torch.cat(
541
+ [
542
+ feat_cache[idx][:, :, -1].unsqueeze(2).to(cache_x.device),
543
+ cache_x,
544
+ ],
545
+ dim=2,
546
+ )
547
+ x = layer(x, feat_cache[idx])
548
+ feat_cache[idx] = cache_x
549
+ feat_idx[0] += 1
550
+ else:
551
+ x = layer(x)
552
+ return x
553
+
554
+
555
+ def count_conv1d(model):
556
+ count = 0
557
+ for m in model.modules():
558
+ if isinstance(m, CausalConv1d):
559
+ count += 1
560
+ return count
561
+
562
+
563
+ class WanVAE_(nn.Module):
564
+ def __init__(
565
+ self,
566
+ input_dim,
567
+ dim=160,
568
+ dec_dim=256,
569
+ z_dim=16,
570
+ dim_mult=[1, 2, 4, 4],
571
+ num_res_blocks=1,
572
+ temperal_downsample=[True, True, False],
573
+ dropout=0.0,
574
+ ):
575
+ super().__init__()
576
+ self.dim = dim
577
+ self.z_dim = z_dim
578
+ self.dim_mult = dim_mult
579
+ self.num_res_blocks = num_res_blocks
580
+ self.temperal_downsample = temperal_downsample
581
+ self.temperal_upsample = temperal_downsample[::-1]
582
+
583
+ # modules
584
+ self.encoder = Encoder1d(
585
+ input_dim,
586
+ dim,
587
+ z_dim * 2,
588
+ dim_mult,
589
+ num_res_blocks,
590
+ self.temperal_downsample,
591
+ dropout,
592
+ )
593
+ self.conv1 = CausalConv1d(z_dim * 2, z_dim * 2, 1)
594
+ self.conv2 = CausalConv1d(z_dim, z_dim, 1)
595
+ self.decoder = Decoder1d(
596
+ input_dim,
597
+ dec_dim,
598
+ z_dim,
599
+ dim_mult,
600
+ num_res_blocks,
601
+ self.temperal_upsample,
602
+ dropout,
603
+ )
604
+
605
+ def forward(self, x, scale=[0, 1]):
606
+ mu = self.encode(x, scale)
607
+ x_recon = self.decode(mu, scale)
608
+ return x_recon, mu
609
+
610
+ def encode(self, x, scale, return_dist=False):
611
+ self.clear_cache()
612
+ t = x.shape[2]
613
+ iter_ = 1 + (t - 1) // 4
614
+ for i in range(iter_):
615
+ self._enc_conv_idx = [0]
616
+ if i == 0:
617
+ out = self.encoder(
618
+ x[:, :, :1],
619
+ feat_cache=self._enc_feat_map,
620
+ feat_idx=self._enc_conv_idx,
621
+ )
622
+ else:
623
+ out_ = self.encoder(
624
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i],
625
+ feat_cache=self._enc_feat_map,
626
+ feat_idx=self._enc_conv_idx,
627
+ )
628
+ out = torch.cat([out, out_], 2)
629
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
630
+ if isinstance(scale[0], torch.Tensor):
631
+ mu = (mu - scale[0].view(1, self.z_dim, 1)) * scale[1].view(
632
+ 1, self.z_dim, 1
633
+ )
634
+ else:
635
+ mu = (mu - scale[0]) * scale[1]
636
+ self.clear_cache()
637
+ if return_dist:
638
+ return mu, log_var
639
+ return mu
640
+
641
+ def decode(self, z, scale):
642
+ self.clear_cache()
643
+ if isinstance(scale[0], torch.Tensor):
644
+ z = z / scale[1].view(1, self.z_dim, 1) + scale[0].view(1, self.z_dim, 1)
645
+ else:
646
+ z = z / scale[1] + scale[0]
647
+ iter_ = z.shape[2]
648
+ x = self.conv2(z)
649
+ for i in range(iter_):
650
+ self._conv_idx = [0]
651
+ if i == 0:
652
+ out = self.decoder(
653
+ x[:, :, i : i + 1],
654
+ feat_cache=self._feat_map,
655
+ feat_idx=self._conv_idx,
656
+ first_chunk=True,
657
+ )
658
+ else:
659
+ out_ = self.decoder(
660
+ x[:, :, i : i + 1],
661
+ feat_cache=self._feat_map,
662
+ feat_idx=self._conv_idx,
663
+ )
664
+ out = torch.cat([out, out_], 2)
665
+ self.clear_cache()
666
+ return out
667
+
668
+ @torch.no_grad()
669
+ def stream_encode(self, x, first_chunk, scale, return_dist=False):
670
+ t = x.shape[2]
671
+ if first_chunk:
672
+ iter_ = 1 + (t - 1) // 4
673
+ else:
674
+ iter_ = t // 4
675
+ for i in range(iter_):
676
+ self._enc_conv_idx = [0]
677
+ if i == 0:
678
+ if first_chunk:
679
+ out = self.encoder(
680
+ x[:, :, :1],
681
+ feat_cache=self._enc_feat_map,
682
+ feat_idx=self._enc_conv_idx,
683
+ )
684
+ else:
685
+ out = self.encoder(
686
+ x[:, :, :4],
687
+ feat_cache=self._enc_feat_map,
688
+ feat_idx=self._enc_conv_idx,
689
+ )
690
+ else:
691
+ if first_chunk:
692
+ out_ = self.encoder(
693
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i],
694
+ feat_cache=self._enc_feat_map,
695
+ feat_idx=self._enc_conv_idx,
696
+ )
697
+ else:
698
+ out_ = self.encoder(
699
+ x[:, :, 4 * i : 4 * (i + 1)],
700
+ feat_cache=self._enc_feat_map,
701
+ feat_idx=self._enc_conv_idx,
702
+ )
703
+ out = torch.cat([out, out_], 2)
704
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
705
+ if isinstance(scale[0], torch.Tensor):
706
+ mu = (mu - scale[0].view(1, self.z_dim, 1)) * scale[1].view(
707
+ 1, self.z_dim, 1
708
+ )
709
+ else:
710
+ mu = (mu - scale[0]) * scale[1]
711
+ if return_dist:
712
+ return mu, log_var
713
+ else:
714
+ return mu
715
+
716
+ @torch.no_grad()
717
+ def stream_decode(self, z, first_chunk, scale):
718
+ if isinstance(scale[0], torch.Tensor):
719
+ z = z / scale[1].view(1, self.z_dim, 1) + scale[0].view(1, self.z_dim, 1)
720
+ else:
721
+ z = z / scale[1] + scale[0]
722
+ iter_ = z.shape[2]
723
+ x = self.conv2(z)
724
+ for i in range(iter_):
725
+ self._conv_idx = [0]
726
+ if i == 0:
727
+ out = self.decoder(
728
+ x[:, :, i : i + 1],
729
+ feat_cache=self._feat_map,
730
+ feat_idx=self._conv_idx,
731
+ first_chunk=first_chunk, # Use the external first_chunk parameter
732
+ )
733
+ else:
734
+ out_ = self.decoder(
735
+ x[:, :, i : i + 1],
736
+ feat_cache=self._feat_map,
737
+ feat_idx=self._conv_idx,
738
+ first_chunk=False, # Explicitly set to False for subsequent time steps within the same chunk
739
+ )
740
+ out = torch.cat([out, out_], 2)
741
+ return out
742
+
743
+ def reparameterize(self, mu, log_var):
744
+ std = torch.exp(0.5 * log_var)
745
+ eps = torch.randn_like(std)
746
+ return eps * std + mu
747
+
748
+ def sample(self, imgs, deterministic=False):
749
+ mu, log_var = self.encode(imgs)
750
+ if deterministic:
751
+ return mu
752
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
753
+ return mu + std * torch.randn_like(std)
754
+
755
+ def clear_cache(self):
756
+ self._conv_num = count_conv1d(self.decoder)
757
+ self._conv_idx = [0]
758
+ self._feat_map = [None] * self._conv_num
759
+ # cache encode
760
+ self._enc_conv_num = count_conv1d(self.encoder)
761
+ self._enc_conv_idx = [0]
762
+ self._enc_feat_map = [None] * self._enc_conv_num
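encode() above walks the sequence causally: the first chunk is a single frame and every later chunk covers 4 frames, so an input of 1 + 4k frames yields 1 + k latent steps; decode() reverses this one latent at a time. A usage sketch with the default constructor arguments (the 85-dim feature size and the import path are assumptions for illustration):

import torch
from ldf_models.tools.wan_vae_1d import WanVAE_

vae = WanVAE_(input_dim=85).eval()
x = torch.randn(1, 85, 33)                  # (B, C, T) with T = 1 + 4 * 8
with torch.no_grad():
    mu = vae.encode(x, scale=[0, 1])        # expected (1, 16, 9): 1 + 8 latent steps
    recon = vae.decode(mu, scale=[0, 1])    # expected (1, 85, 33): frames recovered
print(mu.shape, recon.shape)
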
ldf_models/vae_wan_1d.py ADDED
@@ -0,0 +1,212 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from .tools.wan_vae_1d import WanVAE_
6
+
7
+
8
+ class VAEWanModel(nn.Module):
9
+ def __init__(
10
+ self,
11
+ input_dim,
12
+ mean_path=None,
13
+ std_path=None,
14
+ z_dim=256,
15
+ dim=160,
16
+ dec_dim=512,
17
+ num_res_blocks=1,
18
+ dropout=0.0,
19
+ dim_mult=[1, 1, 1],
20
+ temperal_downsample=[True, True],
21
+ vel_window=[0, 0],
22
+ **kwargs,
23
+ ):
24
+ super().__init__()
25
+
26
+ self.mean_path = mean_path
27
+ self.std_path = std_path
28
+ self.input_dim = input_dim
29
+ self.z_dim = z_dim
30
+ self.dim = dim
31
+ self.dec_dim = dec_dim
32
+ self.num_res_blocks = num_res_blocks
33
+ self.dropout = dropout
34
+ self.dim_mult = dim_mult
35
+ self.temperal_downsample = temperal_downsample
36
+ self.vel_window = vel_window
37
+ self.RECONS_LOSS = nn.SmoothL1Loss()
38
+ self.LAMBDA_FEATURE = kwargs.get("LAMBDA_FEATURE", 1.0)
39
+ self.LAMBDA_VELOCITY = kwargs.get("LAMBDA_VELOCITY", 0.5)
40
+ self.LAMBDA_KL = kwargs.get("LAMBDA_KL", 10e-6)
41
+
42
+ if self.mean_path is not None:
43
+ self.register_buffer(
44
+ "mean", torch.from_numpy(np.load(self.mean_path)).float()
45
+ )
46
+ else:
47
+ self.register_buffer("mean", torch.zeros(input_dim))
48
+
49
+ if self.std_path is not None:
50
+ self.register_buffer(
51
+ "std", torch.from_numpy(np.load(self.std_path)).float()
52
+ )
53
+ else:
54
+ self.register_buffer("std", torch.ones(input_dim))
55
+
56
+ self.model = WanVAE_(
57
+ input_dim=self.input_dim,
58
+ dim=self.dim,
59
+ dec_dim=self.dec_dim,
60
+ z_dim=self.z_dim,
61
+ dim_mult=self.dim_mult,
62
+ num_res_blocks=self.num_res_blocks,
63
+ temperal_downsample=self.temperal_downsample,
64
+ dropout=self.dropout,
65
+ )
66
+
67
+ downsample_factor = 1
68
+ for flag in self.temperal_downsample:
69
+ if flag:
70
+ downsample_factor *= 2
71
+ self.downsample_factor = downsample_factor
72
+
73
+ def preprocess(self, x):
74
+ # (bs, T, C) -> (bs, C, T)
75
+ x = x.permute(0, 2, 1)
76
+ return x
77
+
78
+ def postprocess(self, x):
79
+ # (bs, C, T) -> (bs, T, C)
80
+ x = x.permute(0, 2, 1)
81
+ return x
82
+
83
+ def forward(self, x):
84
+ features = x["feature"]
85
+ feature_length = x["feature_length"]
86
+ features = (features - self.mean) / self.std
87
+ # create mask based on feature_length
88
+ batch_size, seq_len = features.shape[:2]
89
+ mask = torch.zeros(
90
+ batch_size, seq_len, dtype=torch.bool, device=features.device
91
+ )
92
+ for i in range(batch_size):
93
+ mask[i, : feature_length[i]] = True
94
+
95
+ x_in = self.preprocess(features) # (bs, input_dim, T)
96
+ mu, log_var = self.model.encode(
97
+ x_in, scale=[0, 1], return_dist=True
98
+ ) # (bs, z_dim, T)
99
+ z = self.model.reparameterize(mu, log_var)
100
+ x_decoder = self.model.decode(z, scale=[0, 1]) # (bs, input_dim, T)
101
+ x_out = self.postprocess(x_decoder) # (bs, T, input_dim)
102
+
103
+ if x_out.size(1) != features.size(1):
104
+ min_len = min(x_out.size(1), features.size(1))
105
+ x_out = x_out[:, :min_len, :]
106
+ features = features[:, :min_len, :]
107
+ mask = mask[:, :min_len]
108
+
109
+ mask_expanded = mask.unsqueeze(-1)
110
+ x_out_masked = x_out * mask_expanded
111
+ features_masked = features * mask_expanded
112
+ loss_recons = self.RECONS_LOSS(x_out_masked, features_masked)
113
+ vel_start = self.vel_window[0]
114
+ vel_end = self.vel_window[1]
115
+ loss_vel = self.RECONS_LOSS(
116
+ x_out_masked[..., vel_start:vel_end],
117
+ features_masked[..., vel_start:vel_end],
118
+ )
119
+
120
+ # Compute KL divergence loss
121
+ # KL(N(mu, sigma) || N(0, 1)) = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
122
+ # log_var = log(sigma^2), so we can use it directly
123
+
124
+ # Build mask for latent space
125
+ T_latent = mu.size(2)
126
+ mask_downsampled = torch.zeros(
127
+ batch_size, T_latent, dtype=torch.bool, device=features.device
128
+ )
129
+ for i in range(batch_size):
130
+ latent_length = (
131
+ feature_length[i] + self.downsample_factor - 1
132
+ ) // self.downsample_factor
133
+ mask_downsampled[i, :latent_length] = True
134
+ mask_latent = mask_downsampled.unsqueeze(1) # (B, 1, T_latent)
135
+
136
+ # Compute KL loss per element
137
+ kl_per_element = -0.5 * (1 + log_var - mu.pow(2) - log_var.exp())
138
+ # Apply mask: only compute KL loss for valid timesteps
139
+ kl_masked = kl_per_element * mask_latent
140
+ # Sum over all dimensions and normalize by the number of valid elements
141
+ kl_loss = torch.sum(kl_masked) / (
142
+ torch.sum(mask_downsampled) * mu.size(1)
143
+ ) # normalize by valid timesteps * latent_dim
144
+
145
+ # Total loss
146
+ total_loss = (
147
+ self.LAMBDA_FEATURE * loss_recons
148
+ + self.LAMBDA_VELOCITY * loss_vel
149
+ + self.LAMBDA_KL * kl_loss
150
+ )
151
+
152
+ loss_dict = {}
153
+ loss_dict["total"] = total_loss
154
+ loss_dict["recons"] = loss_recons
155
+ loss_dict["velocity"] = loss_vel
156
+ loss_dict["kl"] = kl_loss
157
+
158
+ return loss_dict
159
+
160
+ def encode(self, x):
161
+ x = (x - self.mean) / self.std
162
+ x_in = self.preprocess(x) # (bs, T, input_dim) -> (bs, input_dim, T)
163
+ mu = self.model.encode(x_in, scale=[0, 1]) # (bs, z_dim, T)
164
+ mu = self.postprocess(mu) # (bs, T, z_dim)
165
+ return mu
166
+
167
+ def decode(self, mu):
168
+ mu_in = self.preprocess(mu) # (bs, T, z_dim) -> (bs, z_dim, T)
169
+ x_decoder = self.model.decode(mu_in, scale=[0, 1])  # (bs, input_dim, T)
170
+ x_out = self.postprocess(x_decoder) # (bs, T, input_dim)
171
+ x_out = x_out * self.std + self.mean
172
+ return x_out
173
+
174
+ @torch.no_grad()
175
+ def stream_encode(self, x, first_chunk=True):
176
+ x = (x - self.mean) / self.std
177
+ x_in = self.preprocess(x) # (bs, input_dim, T)
178
+ mu = self.model.stream_encode(x_in, first_chunk=first_chunk, scale=[0, 1])
179
+ mu = self.postprocess(mu) # (bs, T, z_dim)
180
+ return mu
181
+
182
+ @torch.no_grad()
183
+ def stream_decode(self, mu, first_chunk=True):
184
+ mu_in = self.preprocess(mu) # (bs, z_dim, T)
185
+ x_decoder = self.model.stream_decode(
186
+ mu_in, first_chunk=first_chunk, scale=[0, 1]
187
+ )
188
+ x_out = self.postprocess(x_decoder) # (bs, T, input_dim)
189
+ x_out = x_out * self.std + self.mean
190
+ return x_out
191
+
192
+ def clear_cache(self):
193
+ self.model.clear_cache()
194
+
195
+ def generate(self, x):
196
+ features = x["feature"]
197
+ feature_length = x["feature_length"]
198
+ y_hat = self.decode(self.encode(features))
199
+
200
+ y_hat_out = []
201
+
202
+ for i in range(y_hat.shape[0]):
203
+ # cut off the padding and align lengths
204
+ valid_len = (
205
+ feature_length[i] - 1
206
+ ) // self.downsample_factor * self.downsample_factor + 1
207
+ # Make sure both have the same length (take minimum)
208
+ y_hat_out.append(y_hat[i, :valid_len, :])
209
+
210
+ out = {}
211
+ out["generated"] = y_hat_out
212
+ return out
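The wrapper above works on (B, T, C) motion features, applies mean/std normalization internally, and compresses time by self.downsample_factor (4 with the default temperal_downsample=[True, True]). A usage sketch; the 263-dim HumanML3D-style feature size is an illustrative choice:

import torch
from ldf_models.vae_wan_1d import VAEWanModel

model = VAEWanModel(input_dim=263).eval()
motion = torch.randn(1, 49, 263)            # (B, T, C) with T = 1 + 4 * 12
with torch.no_grad():
    latents = model.encode(motion)          # expected (1, 13, 256): T compressed by 4
    recon = model.decode(latents)           # expected (1, 49, 263)
print(model.downsample_factor, latents.shape, recon.shape)
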
ldf_utils/__init__.py ADDED
File without changes
ldf_utils/initialize.py ADDED
@@ -0,0 +1,286 @@
1
+ import argparse
2
+ import os
3
+ import shutil
4
+ import time
5
+ from datetime import datetime
6
+ from importlib import import_module
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional
9
+
10
+ import torch
11
+ from lightning.pytorch.utilities import rank_zero_info
12
+ from omegaconf import OmegaConf
13
+
14
+
15
+ class Config:
16
+ def __init__(self, config_path: str = None, override_args: Dict[str, Any] = None):
17
+ self.config = OmegaConf.create({})
18
+
19
+ # Load main config if provided
20
+ if config_path:
21
+ self.load_yaml(config_path)
22
+ if override_args:
23
+ self.override_config(override_args)
24
+
25
+ def load_yaml(self, config_path: str):
26
+ """Load YAML configuration file"""
27
+ loaded_config = OmegaConf.load(config_path)
28
+ self.config = OmegaConf.merge(self.config, loaded_config)
29
+
30
+ def override_config(self, override_args: Dict[str, Any]):
31
+ """Handle command line override arguments"""
32
+ dotlist = []
33
+ for key, value in override_args.items():
+ # Convert the raw override string to a basic Python type; plain strings
+ # (e.g. paths ending in .yaml) pass through _convert_value unchanged.
+ val = self._convert_value(value)
+ # OmegaConf.update accepts dotted keys, so nested values can be overridden directly.
49
+ OmegaConf.update(self.config, key, val)
50
+
51
+ def _convert_value(self, value: str) -> Any:
52
+ """Convert string value to appropriate type"""
53
+ if value.lower() == "true":
54
+ return True
55
+ elif value.lower() == "false":
56
+ return False
57
+ elif value.lower() == "null":
58
+ return None
59
+ try:
60
+ return int(value)
61
+ except ValueError:
62
+ try:
63
+ return float(value)
64
+ except ValueError:
65
+ return value
66
+
67
+ def get(self, key: str, default: Any = None) -> Any:
68
+ """Get configuration value"""
69
+ return OmegaConf.select(self.config, key, default=default)
70
+
71
+ def __getattr__(self, name: str) -> Any:
72
+ """Support dot notation access"""
73
+ return self.config[name]
74
+
75
+ def __getitem__(self, key: str) -> Any:
76
+ """Support dictionary-like access"""
77
+ return self.config[key]
78
+
79
+ def export_config(self, path: str):
80
+ """Export current configuration to file"""
81
+ OmegaConf.save(self.config, path)
82
+
83
+
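A usage sketch of the Config helper above; the YAML file name and keys are made up for illustration:

from ldf_utils.initialize import Config

# config.yaml (hypothetical):
#   exp_name: debug
#   train:
#     lr: 0.0001
cfg = Config("config.yaml", override_args={"train.lr": "0.001", "train.use_ema": "true"})
print(cfg.exp_name)                  # dot-notation access -> "debug"
print(cfg.get("train.lr"))           # overridden and type-converted -> 0.001
print(cfg["train"]["use_ema"])       # "true" parsed to a boolean -> True
cfg.export_config("resolved.yaml")   # write the merged configuration back out
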
84
+ def parse_args():
85
+ """Parse command line arguments"""
86
+ parser = argparse.ArgumentParser()
87
+ parser.add_argument(
88
+ "--config", type=str, required=True, help="Path to config file"
89
+ )
90
+ parser.add_argument(
91
+ "--override", type=str, nargs="+", help="Override config values (key=value)"
92
+ )
93
+ return parser.parse_args()
94
+
95
+
96
+ def load_config(
97
+ config_path: Optional[str] = None, override_args: Optional[Dict[str, Any]] = None
98
+ ) -> Config:
99
+ """Load configuration"""
100
+ if config_path is None:
101
+ args = parse_args()
102
+ config_path = args.config
103
+ if args.override:
104
+ override_args = {}
105
+ for override in args.override:
106
+ key, value = override.split("=", 1)
107
+ override_args[key.strip()] = value.strip()
108
+
109
+ return Config(config_path, override_args)
110
+
111
+
112
+ def instantiate(target, cfg=None, hfstyle=False, **init_args):
113
+ module_name, class_name = target.rsplit(".", 1)
114
+ module = import_module(module_name)
115
+ class_ = getattr(module, class_name)
116
+ if cfg is None:
117
+ return class_(**init_args)
118
+ else:
119
+ if hfstyle:
120
+ config_class = class_.config_class
121
+ cfg = config_class(config_obj=cfg)
122
+ return class_(cfg, **init_args)
123
+
124
+
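instantiate() builds an object from a dotted import path, which is how configs can name the classes they want constructed; get_function() does the same for plain callables. A hedged sketch using standard-library targets rather than the repo's own classes:

from ldf_utils.initialize import get_function, instantiate

ordered = instantiate("collections.OrderedDict", a=1, b=2)   # class resolved from its dotted path
print(dict(ordered))                                          # {'a': 1, 'b': 2}

join = get_function("os.path.join")                           # callables resolve the same way
print(join("outputs", "run_001"))                             # outputs/run_001
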
125
+ def get_function(target):
126
+ module_name, function_name = target.rsplit(".", 1)
127
+ module = import_module(module_name)
128
+ function_ = getattr(module, function_name)
129
+ return function_
130
+
131
+
132
+ def save_config_and_codes(config, save_dir):
133
+ os.makedirs(save_dir, exist_ok=True)
134
+ sanity_check_dir = os.path.join(save_dir, "sanity_check")
135
+ os.makedirs(sanity_check_dir, exist_ok=True)
136
+ with open(os.path.join(sanity_check_dir, f"{config.exp_name}.yaml"), "w") as f:
137
+ OmegaConf.save(config.config, f)
138
+ current_dir = Path.cwd()
139
+ exclude_dir = current_dir / "outputs"
140
+ for py_file in current_dir.rglob("*.py"):
141
+ if exclude_dir in py_file.parents:
142
+ continue
143
+ dest_path = Path(sanity_check_dir) / py_file.relative_to(current_dir)
144
+ dest_path.parent.mkdir(parents=True, exist_ok=True)
145
+ shutil.copy(py_file, dest_path)
146
+
147
+
148
+ def print_model_size(model):
149
+ total_params = sum(p.numel() for p in model.parameters())
150
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
151
+ rank_zero_info(f"Total parameters: {total_params:,}")
152
+ rank_zero_info(f"Trainable parameters: {trainable_params:,}")
153
+ rank_zero_info(f"Non-trainable parameters: {(total_params - trainable_params):,}")
154
+
155
+
156
+ def compare_statedict_and_parameters(state_dict, named_parameters, named_buffers):
157
+ """Compare differences between state_dict and parameters"""
158
+ # Get all keys in state_dict
159
+ state_dict_keys = set(state_dict.keys())
160
+
161
+ # Get all keys in named_parameters
162
+ named_params_keys = set(name for name, _ in named_parameters)
163
+
164
+ # Find keys that only exist in state_dict
165
+ only_in_state_dict = state_dict_keys - named_params_keys
166
+
167
+ # Find keys that only exist in named_parameters
168
+ only_in_named_params = named_params_keys - state_dict_keys
169
+
170
+ # Print results
171
+ if only_in_state_dict:
172
+ print(f"Only in state_dict (not in parameters): {sorted(only_in_state_dict)}")
173
+
174
+ if only_in_named_params:
175
+ print(
176
+ f"Only in named_parameters (not in state_dict): {sorted(only_in_named_params)}"
177
+ )
178
+
179
+ if not only_in_state_dict and not only_in_named_params:
180
+ print("All parameters match between state_dict and named_parameters")
181
+
182
+ # Additionally compare buffers (non-parameter states, such as BatchNorm's running_mean)
183
+ named_buffers_keys = set(name for name, _ in named_buffers)
184
+ buffers_only = state_dict_keys - named_params_keys - named_buffers_keys
185
+
186
+ if buffers_only:
187
+ print(
188
+ f"Other items in state_dict (neither params nor buffers): {sorted(buffers_only)}"
189
+ )
190
+
191
+ print(f"Total state_dict items: {len(state_dict_keys)}")
192
+ print(f"Total named_parameters: {len(named_params_keys)}")
193
+ print(f"Total named_buffers: {len(named_buffers_keys)}")
194
+
195
+
196
+ def _resolve_global_rank() -> int:
197
+ """Resolve the global rank from environment variables."""
198
+ for key in ("GLOBAL_RANK", "RANK", "SLURM_PROCID", "LOCAL_RANK"):
199
+ if key in os.environ:
200
+ try:
201
+ return int(os.environ[key])
202
+ except ValueError:
203
+ continue
204
+ return 0
205
+
206
+
207
+ def get_shared_run_time(base_dir: str, env_key: str = "PL_RUN_TIME") -> str:
208
+ """
209
+ Get a synchronized run time across all processes.
210
+
211
+ This function ensures all processes (both in distributed training and multi-process
212
+ scenarios) use the same timestamp for output directories and experiment tracking.
213
+
214
+ Args:
215
+ base_dir: Base directory for output files
216
+ env_key: Environment variable key to cache the run time
217
+
218
+ Returns:
219
+ Synchronized timestamp string in format YYYYMMDD_HHMMSS
220
+ """
221
+ cached = os.environ.get(env_key)
222
+ if cached:
223
+ return cached
224
+
225
+ timestamp_format = "%Y%m%d_%H%M%S"
226
+
227
+ if torch.distributed.is_available() and torch.distributed.is_initialized():
228
+ if torch.distributed.get_rank() == 0:
229
+ run_time = datetime.now().strftime(timestamp_format)
230
+ else:
231
+ run_time = None
232
+ container = [run_time]
233
+ torch.distributed.broadcast_object_list(container, src=0)
234
+ run_time = container[0]
235
+ if run_time is None:
236
+ raise RuntimeError("Failed to synchronize run time across ranks.")
237
+ os.environ[env_key] = run_time
238
+ return run_time
239
+
240
+ os.makedirs(base_dir, exist_ok=True)
241
+ sync_token = (
242
+ os.environ.get("SLURM_JOB_ID")
243
+ or os.environ.get("TORCHELASTIC_RUN_ID")
244
+ or os.environ.get("JOB_ID")
245
+ or "default"
246
+ )
247
+ sync_dir = os.path.join(base_dir, ".run_time_sync")
248
+ os.makedirs(sync_dir, exist_ok=True)
249
+ sync_file = os.path.join(sync_dir, f"{sync_token}.txt")
250
+
251
+ global_rank = _resolve_global_rank()
252
+ if global_rank == 0:
253
+ # Remove the sync file if it exists to avoid stale reads by other ranks
254
+ if os.path.exists(sync_file):
255
+ try:
256
+ os.remove(sync_file)
257
+ except OSError:
258
+ pass
259
+
260
+ run_time = datetime.now().strftime(timestamp_format)
261
+ with open(sync_file, "w", encoding="utf-8") as f:
262
+ f.write(run_time)
263
+ else:
264
+ timeout = time.monotonic() + 1200.0
265
+ while True:
266
+ if os.path.exists(sync_file):
267
+ try:
268
+ with open(sync_file, "r", encoding="utf-8") as f:
269
+ run_time = f.read().strip()
270
+ # Check if the timestamp is fresh (within 60 seconds)
271
+ # This prevents reading a stale timestamp from a previous run
272
+ dt = datetime.strptime(run_time, timestamp_format)
273
+ if abs((datetime.now() - dt).total_seconds()) < 60:
274
+ break
275
+ except (ValueError, OSError):
276
+ # File might be empty or partially written, or format mismatch
277
+ pass
278
+
279
+ if time.monotonic() > timeout:
280
+ raise TimeoutError(
281
+ "Timed out waiting for rank 0 to write synchronized timestamp."
282
+ )
283
+ time.sleep(0.1)
284
+
285
+ os.environ[env_key] = run_time
286
+ return run_time
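A usage sketch of the synchronization above: every process calls the function with the same base_dir; rank 0 produces the timestamp (broadcast when torch.distributed is initialized, written to a sync file otherwise) and the other ranks pick it up, so all ranks share one run directory. The directory names are illustrative:

import os
from ldf_utils.initialize import get_shared_run_time

run_time = get_shared_run_time("outputs")             # e.g. "20240101_120000", identical on every rank
run_dir = os.path.join("outputs", f"run_{run_time}")
os.makedirs(run_dir, exist_ok=True)
# later calls in the same process are cheap: the value is cached in the PL_RUN_TIME env var
assert get_shared_run_time("outputs") == run_time
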
ldf_utils/math/__init__.py ADDED
File without changes
ldf_utils/math/quaternion.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) 2018-present, Facebook, Inc.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ _EPS4 = np.finfo(float).eps * 4.0
12
+
13
+ _FLOAT_EPS = np.finfo(np.float64).eps
14
+
15
+ # PyTorch-backed implementations
16
+
17
+
18
+ def qinv(q):
19
+ assert q.shape[-1] == 4, "q must be a tensor of shape (*, 4)"
20
+ mask = torch.ones_like(q)
21
+ mask[..., 1:] = -mask[..., 1:]
22
+ return q * mask
23
+
24
+
25
+ def qinv_np(q):
26
+ assert q.shape[-1] == 4, "q must be a tensor of shape (*, 4)"
27
+ return qinv(torch.from_numpy(q).float()).numpy()
28
+
29
+
30
+ def qnormalize(q):
31
+ assert q.shape[-1] == 4, "q must be a tensor of shape (*, 4)"
32
+ return q / torch.norm(q, dim=-1, keepdim=True)
33
+
34
+
35
+ def qmul(q, r):
36
+ """
37
+ Multiply quaternion(s) q with quaternion(s) r.
38
+ Expects two equally-sized tensors of shape (*, 4), where * denotes any number of dimensions.
39
+ Returns q*r as a tensor of shape (*, 4).
40
+ """
41
+ assert q.shape[-1] == 4
42
+ assert r.shape[-1] == 4
43
+
44
+ original_shape = q.shape
45
+
46
+ # Compute outer product
47
+ terms = torch.bmm(r.view(-1, 4, 1), q.view(-1, 1, 4))
48
+
49
+ w = terms[:, 0, 0] - terms[:, 1, 1] - terms[:, 2, 2] - terms[:, 3, 3]
50
+ x = terms[:, 0, 1] + terms[:, 1, 0] - terms[:, 2, 3] + terms[:, 3, 2]
51
+ y = terms[:, 0, 2] + terms[:, 1, 3] + terms[:, 2, 0] - terms[:, 3, 1]
52
+ z = terms[:, 0, 3] - terms[:, 1, 2] + terms[:, 2, 1] + terms[:, 3, 0]
53
+ return torch.stack((w, x, y, z), dim=1).view(original_shape)
54
+
55
+
56
+ def qrot(q, v):
57
+ """
58
+ Rotate vector(s) v about the rotation described by quaternion(s) q.
59
+ Expects a tensor of shape (*, 4) for q and a tensor of shape (*, 3) for v,
60
+ where * denotes any number of dimensions.
61
+ Returns a tensor of shape (*, 3).
62
+ """
63
+ assert q.shape[-1] == 4
64
+ assert v.shape[-1] == 3
65
+ assert q.shape[:-1] == v.shape[:-1]
66
+
67
+ original_shape = list(v.shape)
68
+ # print(q.shape)
69
+ q = q.contiguous().view(-1, 4)
70
+ v = v.contiguous().view(-1, 3)
71
+
72
+ qvec = q[:, 1:]
73
+ uv = torch.cross(qvec, v, dim=1)
74
+ uuv = torch.cross(qvec, uv, dim=1)
75
+ return (v + 2 * (q[:, :1] * uv + uuv)).view(original_shape)
76
+
77
+
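A quick sanity check of the helpers above: with the (w, x, y, z) convention, a quaternion for a 90-degree rotation about the Y axis should map the X axis onto -Z under qrot, and qmul of a quaternion with its qinv should give the identity. A small sketch:

import numpy as np
import torch
from ldf_utils.math.quaternion import qinv, qmul, qrot

q = torch.tensor([[np.cos(np.pi / 4), 0.0, np.sin(np.pi / 4), 0.0]], dtype=torch.float32)
v = torch.tensor([[1.0, 0.0, 0.0]])
print(qrot(q, v))          # ~[[0, 0, -1]]: X axis rotated onto -Z
print(qmul(q, qinv(q)))    # ~[[1, 0, 0, 0]]: identity quaternion
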
78
+ def qeuler(q, order, epsilon=0, deg=True):
79
+ """
80
+ Convert quaternion(s) q to Euler angles.
81
+ Expects a tensor of shape (*, 4), where * denotes any number of dimensions.
82
+ Returns a tensor of shape (*, 3).
83
+ """
84
+ assert q.shape[-1] == 4
85
+
86
+ original_shape = list(q.shape)
87
+ original_shape[-1] = 3
88
+ q = q.view(-1, 4)
89
+
90
+ q0 = q[:, 0]
91
+ q1 = q[:, 1]
92
+ q2 = q[:, 2]
93
+ q3 = q[:, 3]
94
+
95
+ if order == "xyz":
96
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
97
+ y = torch.asin(torch.clamp(2 * (q1 * q3 + q0 * q2), -1 + epsilon, 1 - epsilon))
98
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
99
+ elif order == "yzx":
100
+ x = torch.atan2(2 * (q0 * q1 - q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
101
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
102
+ z = torch.asin(torch.clamp(2 * (q1 * q2 + q0 * q3), -1 + epsilon, 1 - epsilon))
103
+ elif order == "zxy":
104
+ x = torch.asin(torch.clamp(2 * (q0 * q1 + q2 * q3), -1 + epsilon, 1 - epsilon))
105
+ y = torch.atan2(2 * (q0 * q2 - q1 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
106
+ z = torch.atan2(2 * (q0 * q3 - q1 * q2), 1 - 2 * (q1 * q1 + q3 * q3))
107
+ elif order == "xzy":
108
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
109
+ y = torch.atan2(2 * (q0 * q2 + q1 * q3), 1 - 2 * (q2 * q2 + q3 * q3))
110
+ z = torch.asin(torch.clamp(2 * (q0 * q3 - q1 * q2), -1 + epsilon, 1 - epsilon))
111
+ elif order == "yxz":
112
+ x = torch.asin(torch.clamp(2 * (q0 * q1 - q2 * q3), -1 + epsilon, 1 - epsilon))
113
+ y = torch.atan2(2 * (q1 * q3 + q0 * q2), 1 - 2 * (q1 * q1 + q2 * q2))
114
+ z = torch.atan2(2 * (q1 * q2 + q0 * q3), 1 - 2 * (q1 * q1 + q3 * q3))
115
+ elif order == "zyx":
116
+ x = torch.atan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
117
+ y = torch.asin(torch.clamp(2 * (q0 * q2 - q1 * q3), -1 + epsilon, 1 - epsilon))
118
+ z = torch.atan2(2 * (q0 * q3 + q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
119
+ else:
120
+ raise ValueError(f"Invalid rotation order: {order}")
121
+
122
+ if deg:
123
+ return torch.stack((x, y, z), dim=1).view(original_shape) * 180 / np.pi
124
+ else:
125
+ return torch.stack((x, y, z), dim=1).view(original_shape)
126
+
127
+
128
+ # Numpy-backed implementations
129
+
130
+
131
+ def qmul_np(q, r):
132
+ q = torch.from_numpy(q).contiguous().float()
133
+ r = torch.from_numpy(r).contiguous().float()
134
+ return qmul(q, r).numpy()
135
+
136
+
137
+ def qrot_np(q, v):
138
+ q = torch.from_numpy(q).contiguous().float()
139
+ v = torch.from_numpy(v).contiguous().float()
140
+ return qrot(q, v).numpy()
141
+
142
+
143
+ def qeuler_np(q, order, epsilon=0, use_gpu=False):
144
+ if use_gpu:
145
+ q = torch.from_numpy(q).cuda().float()
146
+ return qeuler(q, order, epsilon).cpu().numpy()
147
+ else:
148
+ q = torch.from_numpy(q).contiguous().float()
149
+ return qeuler(q, order, epsilon).numpy()
150
+
151
+
152
+ def qfix(q):
153
+ """
154
+ Enforce quaternion continuity across the time dimension by selecting
155
+ the representation (q or -q) with minimal distance (or, equivalently, maximal dot product)
156
+ between two consecutive frames.
157
+
158
+ Expects a tensor of shape (L, J, 4), where L is the sequence length and J is the number of joints.
159
+ Returns a tensor of the same shape.
160
+ """
161
+ assert len(q.shape) == 3
162
+ assert q.shape[-1] == 4
163
+
164
+ result = q.copy()
165
+ dot_products = np.sum(q[1:] * q[:-1], axis=2)
166
+ mask = dot_products < 0
167
+ mask = (np.cumsum(mask, axis=0) % 2).astype(bool)
168
+ result[1:][mask] *= -1
169
+ return result
170
+
171
+
172
+ def euler2quat(e, order, deg=True):
173
+ """
174
+ Convert Euler angles to quaternions.
175
+ """
176
+ assert e.shape[-1] == 3
177
+
178
+ original_shape = list(e.shape)
179
+ original_shape[-1] = 4
180
+
181
+ e = e.view(-1, 3)
182
+
183
+ # if euler angles in degrees
184
+ if deg:
185
+ e = e * np.pi / 180.0
186
+
187
+ x = e[:, 0]
188
+ y = e[:, 1]
189
+ z = e[:, 2]
190
+
191
+ rx = torch.stack(
192
+ (torch.cos(x / 2), torch.sin(x / 2), torch.zeros_like(x), torch.zeros_like(x)),
193
+ dim=1,
194
+ )
195
+ ry = torch.stack(
196
+ (torch.cos(y / 2), torch.zeros_like(y), torch.sin(y / 2), torch.zeros_like(y)),
197
+ dim=1,
198
+ )
199
+ rz = torch.stack(
200
+ (torch.cos(z / 2), torch.zeros_like(z), torch.zeros_like(z), torch.sin(z / 2)),
201
+ dim=1,
202
+ )
203
+
204
+ result = None
205
+ for coord in order:
206
+ if coord == "x":
207
+ r = rx
208
+ elif coord == "y":
209
+ r = ry
210
+ elif coord == "z":
211
+ r = rz
212
+ else:
213
+ raise ValueError(f"Invalid axis in rotation order: {coord}")
214
+ if result is None:
215
+ result = r
216
+ else:
217
+ result = qmul(result, r)
218
+
219
+ # Reverse antipodal representation to have a non-negative "w"
220
+ if order in ["xyz", "yzx", "zxy"]:
221
+ result *= -1
222
+
223
+ return result.view(original_shape)
224
+
225
+
226
+ def expmap_to_quaternion(e):
227
+ """
228
+ Convert axis-angle rotations (aka exponential maps) to quaternions.
229
+ Stable formula from "Practical Parameterization of Rotations Using the Exponential Map".
230
+ Expects a tensor of shape (*, 3), where * denotes any number of dimensions.
231
+ Returns a tensor of shape (*, 4).
232
+ """
233
+ assert e.shape[-1] == 3
234
+
235
+ original_shape = list(e.shape)
236
+ original_shape[-1] = 4
237
+ e = e.reshape(-1, 3)
238
+
239
+ theta = np.linalg.norm(e, axis=1).reshape(-1, 1)
240
+ w = np.cos(0.5 * theta).reshape(-1, 1)
241
+ xyz = 0.5 * np.sinc(0.5 * theta / np.pi) * e
242
+ return np.concatenate((w, xyz), axis=1).reshape(original_shape)
243
+
244
+
245
+ def euler_to_quaternion(e, order):
246
+ """
247
+ Convert Euler angles to quaternions.
248
+ """
249
+ assert e.shape[-1] == 3
250
+
251
+ original_shape = list(e.shape)
252
+ original_shape[-1] = 4
253
+
254
+ e = e.reshape(-1, 3)
255
+
256
+ x = e[:, 0]
257
+ y = e[:, 1]
258
+ z = e[:, 2]
259
+
260
+ rx = np.stack(
261
+ (np.cos(x / 2), np.sin(x / 2), np.zeros_like(x), np.zeros_like(x)), axis=1
262
+ )
263
+ ry = np.stack(
264
+ (np.cos(y / 2), np.zeros_like(y), np.sin(y / 2), np.zeros_like(y)), axis=1
265
+ )
266
+ rz = np.stack(
267
+ (np.cos(z / 2), np.zeros_like(z), np.zeros_like(z), np.sin(z / 2)), axis=1
268
+ )
269
+
270
+ result = None
271
+ for coord in order:
272
+ if coord == "x":
273
+ r = rx
274
+ elif coord == "y":
275
+ r = ry
276
+ elif coord == "z":
277
+ r = rz
278
+ else:
279
+ raise ValueError(f"Invalid axis in rotation order: {coord}")
280
+ if result is None:
281
+ result = r
282
+ else:
283
+ result = qmul_np(result, r)
284
+
285
+ # Reverse antipodal representation to have a non-negative "w"
286
+ if order in ["xyz", "yzx", "zxy"]:
287
+ result *= -1
288
+
289
+ return result.reshape(original_shape)
290
+
291
+
292
+ def quaternion_to_matrix(quaternions):
293
+ """
294
+ Convert rotations given as quaternions to rotation matrices.
295
+ Args:
296
+ quaternions: quaternions with real part first,
297
+ as tensor of shape (..., 4).
298
+ Returns:
299
+ Rotation matrices as tensor of shape (..., 3, 3).
300
+ """
301
+ r, i, j, k = torch.unbind(quaternions, -1)
302
+ two_s = 2.0 / (quaternions * quaternions).sum(-1)
303
+
304
+ o = torch.stack(
305
+ (
306
+ 1 - two_s * (j * j + k * k),
307
+ two_s * (i * j - k * r),
308
+ two_s * (i * k + j * r),
309
+ two_s * (i * j + k * r),
310
+ 1 - two_s * (i * i + k * k),
311
+ two_s * (j * k - i * r),
312
+ two_s * (i * k - j * r),
313
+ two_s * (j * k + i * r),
314
+ 1 - two_s * (i * i + j * j),
315
+ ),
316
+ -1,
317
+ )
318
+ return o.reshape(quaternions.shape[:-1] + (3, 3))
319
+
320
+
321
+ def quaternion_to_matrix_np(quaternions):
322
+ q = torch.from_numpy(quaternions).contiguous().float()
323
+ return quaternion_to_matrix(q).numpy()
324
+
325
+
326
+ def quaternion_to_cont6d_np(quaternions):
327
+ rotation_mat = quaternion_to_matrix_np(quaternions)
328
+ cont_6d = np.concatenate([rotation_mat[..., 0], rotation_mat[..., 1]], axis=-1)
329
+ return cont_6d
330
+
331
+
332
+ def quaternion_to_cont6d(quaternions):
333
+ rotation_mat = quaternion_to_matrix(quaternions)
334
+ cont_6d = torch.cat([rotation_mat[..., 0], rotation_mat[..., 1]], dim=-1)
335
+ return cont_6d
336
+
337
+
338
+ def cont6d_to_matrix(cont6d):
339
+ assert cont6d.shape[-1] == 6, "The last dimension must be 6"
340
+ x_raw = cont6d[..., 0:3]
341
+ y_raw = cont6d[..., 3:6]
342
+
343
+ x = x_raw / torch.norm(x_raw, dim=-1, keepdim=True)
344
+ z = torch.cross(x, y_raw, dim=-1)
345
+ z = z / torch.norm(z, dim=-1, keepdim=True)
346
+
347
+ y = torch.cross(z, x, dim=-1)
348
+
349
+ x = x[..., None]
350
+ y = y[..., None]
351
+ z = z[..., None]
352
+
353
+ mat = torch.cat([x, y, z], dim=-1)
354
+ return mat
355
+
356
+
357
+ def cont6d_to_matrix_np(cont6d):
358
+ q = torch.from_numpy(cont6d).contiguous().float()
359
+ return cont6d_to_matrix(q).numpy()
360
+
361
+
362
+ def qpow(q0, t, dtype=torch.float):
363
+ """q0 : tensor of quaternions
364
+ t: tensor of powers
365
+ """
366
+ q0 = qnormalize(q0)
367
+ theta0 = torch.acos(q0[..., 0])
368
+
369
+ # if theta0 is close to zero, add epsilon to avoid NaNs
370
+ mask = (theta0 <= 10e-10) * (theta0 >= -10e-10)
371
+ theta0 = (1 - mask) * theta0 + mask * 10e-10
372
+ v0 = q0[..., 1:] / torch.sin(theta0).view(-1, 1)
373
+
374
+ if isinstance(t, torch.Tensor):
375
+ q = torch.zeros(t.shape + q0.shape)
376
+ theta = t.view(-1, 1) * theta0.view(1, -1)
377
+ else: # if t is a number
378
+ q = torch.zeros(q0.shape)
379
+ theta = t * theta0
380
+
381
+ q[..., 0] = torch.cos(theta)
382
+ q[..., 1:] = v0 * torch.sin(theta).unsqueeze(-1)
383
+
384
+ return q.to(dtype)
385
+
386
+
387
+ def qslerp(q0, q1, t):
388
+ """
389
+ q0: starting quaternion
390
+ q1: ending quaternion
391
+ t: array of points along the way
392
+
393
+ Returns:
394
+ Tensor of Slerps: t.shape + q0.shape
395
+ """
396
+
397
+ q0 = qnormalize(q0)
398
+ q1 = qnormalize(q1)
399
+ q_ = qpow(qmul(q1, qinv(q0)), t)
400
+
401
+ return qmul(
402
+ q_,
403
+ q0.contiguous()
404
+ .view(torch.Size([1] * len(t.shape)) + q0.shape)
405
+ .expand(t.shape + q0.shape)
406
+ .contiguous(),
407
+ )
408
+
409
+
410
+ def qbetween(v0, v1):
411
+ """
412
+ find the quaternion used to rotate v0 to v1
413
+ """
414
+ assert v0.shape[-1] == 3, "v0 must be of the shape (*, 3)"
415
+ assert v1.shape[-1] == 3, "v1 must be of the shape (*, 3)"
416
+
417
+ v = torch.cross(v0, v1)
418
+ w = torch.sqrt(
419
+ (v0**2).sum(dim=-1, keepdim=True) * (v1**2).sum(dim=-1, keepdim=True)
420
+ ) + (v0 * v1).sum(dim=-1, keepdim=True)
421
+ return qnormalize(torch.cat([w, v], dim=-1))
422
+
423
+
424
+ def qbetween_np(v0, v1):
425
+ """
426
+ find the quaternion used to rotate v0 to v1
427
+ """
428
+ assert v0.shape[-1] == 3, "v0 must be of the shape (*, 3)"
429
+ assert v1.shape[-1] == 3, "v1 must be of the shape (*, 3)"
430
+
431
+ v0 = torch.from_numpy(v0).float()
432
+ v1 = torch.from_numpy(v1).float()
433
+ return qbetween(v0, v1).numpy()
434
+
435
+
436
+ def lerp(p0, p1, t):
437
+ if not isinstance(t, torch.Tensor):
438
+ t = torch.Tensor([t])
439
+
440
+ new_shape = t.shape + p0.shape
441
+ new_view_t = t.shape + torch.Size([1] * len(p0.shape))
442
+ new_view_p = torch.Size([1] * len(t.shape)) + p0.shape
443
+ p0 = p0.view(new_view_p).expand(new_shape)
444
+ p1 = p1.view(new_view_p).expand(new_shape)
445
+ t = t.view(new_view_t).expand(new_shape)
446
+
447
+ return p0 + t * (p1 - p0)
ldf_utils/motion_process.py ADDED
@@ -0,0 +1,365 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
+ from ldf_utils.math.quaternion import *
6
+
7
+ """
8
+ Motion data structure:
9
+ (B: batch size)
10
+ root_rot_velocity (B, seq_len, 1)
11
+ root_linear_velocity (B, seq_len, 2)
12
+ root_y (B, seq_len, 1)
13
+ ric_data (B, seq_len, (joint_num - 1)*3)
14
+ rot_data (B, seq_len, (joint_num - 1)*6)
15
+ local_velocity (B, seq_len, joint_num*3)
16
+ foot contact (B, seq_len, 4)
17
+ """
18
+
19
+
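The component sizes listed in the docstring above add up to the 263-dimensional feature used for a 22-joint skeleton; a quick check of that arithmetic:

joints_num = 22
parts = {
    "root_rot_velocity": 1,
    "root_linear_velocity": 2,
    "root_y": 1,
    "ric_data": (joints_num - 1) * 3,    # 63
    "rot_data": (joints_num - 1) * 6,    # 126
    "local_velocity": joints_num * 3,    # 66
    "foot_contact": 4,
}
print(sum(parts.values()))               # 263
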
20
+ def recover_root_rot_pos(data):
21
+ # recover root rotation and position
22
+ rot_vel = data[..., 0]
23
+ r_rot_ang = torch.zeros_like(rot_vel).to(data.device)
24
+ """Get Y-axis rotation from rotation velocity"""
25
+ r_rot_ang[..., 1:] = rot_vel[..., :-1]
26
+ r_rot_ang = torch.cumsum(r_rot_ang, dim=-1)
27
+
28
+ r_rot_quat = torch.zeros(data.shape[:-1] + (4,)).to(data.device)
29
+ r_rot_quat[..., 0] = torch.cos(r_rot_ang)
30
+ r_rot_quat[..., 2] = torch.sin(r_rot_ang)
31
+
32
+ r_pos = torch.zeros(data.shape[:-1] + (3,)).to(data.device)
33
+ r_pos[..., 1:, [0, 2]] = data[..., :-1, 1:3]
34
+ """Add Y-axis rotation to root position"""
35
+ r_pos = qrot(qinv(r_rot_quat), r_pos)
36
+
37
+ r_pos = torch.cumsum(r_pos, dim=-2)
38
+
39
+ r_pos[..., 1] = data[..., 3]
40
+ return r_rot_quat, r_pos
41
+
42
+
43
+ def recover_joint_positions_263(data: np.ndarray, joints_num) -> np.ndarray:
44
+ """
45
+ Recovers 3D joint positions from the rotation-invariant local positions (ric_data).
46
+ This is the most direct way to get the skeleton for animation.
47
+ """
48
+ feature_vec = torch.from_numpy(data).unsqueeze(0).float()
49
+ r_rot_quat, r_pos = recover_root_rot_pos(feature_vec)
50
+ positions = feature_vec[..., 4 : (joints_num - 1) * 3 + 4]
51
+ positions = positions.view(positions.shape[:-1] + (-1, 3))
52
+ """Add Y-axis rotation to local joints"""
53
+ positions = qrot(
54
+ qinv(r_rot_quat[..., None, :]).expand(positions.shape[:-1] + (4,)), positions
55
+ )
56
+ """Add root XZ to joints"""
57
+ positions[..., 0] += r_pos[..., 0:1]
58
+ positions[..., 2] += r_pos[..., 2:3]
59
+ """Concatenate root and joints"""
60
+ positions = torch.cat([r_pos.unsqueeze(-2), positions], dim=-2)
61
+ joints_np = positions.squeeze(0).detach().cpu().numpy()
62
+ return joints_np
63
+
64
+
65
+ class StreamJointRecovery263:
66
+ """
67
+ Stream version of recover_joint_positions_263 that processes one frame at a time.
68
+ Maintains cumulative state for rotation angles and positions.
69
+
70
+ Key insight: The batch version uses PREVIOUS frame's velocity for the current frame,
71
+ so we need to delay the velocity application by one frame.
72
+
73
+ Args:
74
+ joints_num: Number of joints in the skeleton
75
+ smoothing_alpha: EMA smoothing factor (0.0 to 1.0)
76
+ - 1.0 = no smoothing (default), output follows input exactly
77
+ - 0.0 = infinite smoothing, output never changes
78
+ - Recommended values: 0.3-0.7 for visible smoothing
79
+ - Formula: smoothed = alpha * current + (1 - alpha) * previous
80
+ """
81
+
82
+ def __init__(self, joints_num: int, smoothing_alpha: float = 1.0):
83
+ self.joints_num = joints_num
84
+ self.smoothing_alpha = np.clip(smoothing_alpha, 0.0, 1.0)
85
+ self.reset()
86
+
87
+ def reset(self):
88
+ """Reset the accumulated state"""
89
+ self.r_rot_ang_accum = 0.0
90
+ self.r_pos_accum = np.array([0.0, 0.0, 0.0])
91
+ # Store previous frame's velocities for delayed application
92
+ self.prev_rot_vel = 0.0
93
+ self.prev_linear_vel = np.array([0.0, 0.0])
94
+ # Store previous smoothed joints for EMA
95
+ self.prev_smoothed_joints = None
96
+
97
+ def process_frame(self, frame_data: np.ndarray) -> np.ndarray:
98
+ """
99
+ Process a single frame and return joint positions for that frame.
100
+
101
+ Args:
102
+ frame_data: numpy array of shape (263,) for a single frame
103
+
104
+ Returns:
105
+ joints: numpy array of shape (joints_num, 3) representing joint positions
106
+ """
107
+ # Convert to torch tensor
108
+ feature_vec = torch.from_numpy(frame_data).float()
109
+
110
+ # Extract current frame's velocities (will be used in NEXT frame)
111
+ curr_rot_vel = feature_vec[0].item()
112
+ curr_linear_vel = feature_vec[1:3].numpy()
113
+
114
+ # Update accumulated rotation angle with PREVIOUS frame's velocity FIRST
115
+ # This matches the batch processing: r_rot_ang[i] uses rot_vel[i-1]
116
+ self.r_rot_ang_accum += self.prev_rot_vel
117
+
118
+ # Calculate current rotation quaternion using updated accumulated angle
119
+ r_rot_quat = torch.zeros(4)
120
+ r_rot_quat[0] = np.cos(self.r_rot_ang_accum)
121
+ r_rot_quat[2] = np.sin(self.r_rot_ang_accum)
122
+
123
+ # Create velocity vector with Y=0 using PREVIOUS frame's velocity
124
+ r_vel = np.array([self.prev_linear_vel[0], 0.0, self.prev_linear_vel[1]])
125
+
126
+ # Apply inverse rotation to velocity using CURRENT rotation
127
+ r_vel_torch = torch.from_numpy(r_vel).float()
128
+ r_vel_rotated = qrot(qinv(r_rot_quat).unsqueeze(0), r_vel_torch.unsqueeze(0))
129
+ r_vel_rotated = r_vel_rotated.squeeze(0).numpy()
130
+
131
+ # Update accumulated position with rotated velocity
132
+ self.r_pos_accum += r_vel_rotated
133
+
134
+ # Get Y position from data
135
+ r_pos = self.r_pos_accum.copy()
136
+ r_pos[1] = feature_vec[3].item()
137
+
138
+ # Extract local joint positions
139
+ positions = feature_vec[4 : (self.joints_num - 1) * 3 + 4]
140
+ positions = positions.view(-1, 3)
141
+
142
+ # Apply inverse rotation to local joints
143
+ r_rot_quat_expanded = (
144
+ qinv(r_rot_quat).unsqueeze(0).expand(positions.shape[0], 4)
145
+ )
146
+ positions = qrot(r_rot_quat_expanded, positions)
147
+
148
+ # Add root XZ to joints
149
+ positions[:, 0] += r_pos[0]
150
+ positions[:, 2] += r_pos[2]
151
+
152
+ # Concatenate root and joints
153
+ r_pos_torch = torch.from_numpy(r_pos).float()
154
+ positions = torch.cat([r_pos_torch.unsqueeze(0), positions], dim=0)
155
+
156
+ # Convert to numpy
157
+ joints_np = positions.detach().cpu().numpy()
158
+
159
+ # Apply EMA smoothing if enabled
160
+ if self.smoothing_alpha < 1.0:
161
+ if self.prev_smoothed_joints is None:
162
+ # First frame, no smoothing possible
163
+ self.prev_smoothed_joints = joints_np.copy()
164
+ else:
165
+ # EMA: smoothed = alpha * current + (1 - alpha) * previous
166
+ joints_np = (
167
+ self.smoothing_alpha * joints_np
168
+ + (1.0 - self.smoothing_alpha) * self.prev_smoothed_joints
169
+ )
170
+ self.prev_smoothed_joints = joints_np.copy()
171
+
172
+ # Store current velocities for next frame
173
+ self.prev_rot_vel = curr_rot_vel
174
+ self.prev_linear_vel = curr_linear_vel
175
+
176
+ return joints_np
177
+
178
+
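When smoothing is disabled (smoothing_alpha=1.0), the streaming recovery above is intended to reproduce the batch function frame for frame. A small comparison sketch on random features (shapes only, not a real motion):

import numpy as np
from ldf_utils.motion_process import StreamJointRecovery263, recover_joint_positions_263

features = np.random.randn(16, 263).astype(np.float32)                 # 16 frames

batch_joints = recover_joint_positions_263(features, joints_num=22)    # (16, 22, 3)

stream = StreamJointRecovery263(joints_num=22, smoothing_alpha=1.0)
stream_joints = np.stack([stream.process_frame(f) for f in features])  # (16, 22, 3)

print(np.abs(batch_joints - stream_joints).max())                      # expected to be ~0
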
179
+ def accumulate_rotations(relative_rotations):
180
+ R_total = [relative_rotations[0]]
181
+ for R_rel in relative_rotations[1:]:
182
+ R_total.append(np.matmul(R_rel, R_total[-1]))
183
+
184
+ return np.array(R_total)
185
+
186
+
187
+ def recover_from_local_position(final_x, njoint):
188
+ nfrm, _ = final_x.shape
189
+ positions_no_heading = final_x[:, 8 : 8 + 3 * njoint].reshape(
190
+ nfrm, -1, 3
191
+ ) # frames, njoints * 3
192
+ velocities_root_xy_no_heading = final_x[:, :2] # frames, 2
193
+ global_heading_diff_rot = final_x[:, 2:8] # frames, 6
194
+
195
+ # recover global heading
196
+ global_heading_rot = accumulate_rotations(
197
+ rotation_6d_to_matrix(torch.from_numpy(global_heading_diff_rot)).numpy()
198
+ )
199
+ inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
200
+ # add global heading to position
201
+ positions_with_heading = np.matmul(
202
+ np.repeat(inv_global_heading_rot[:, None, :, :], njoint, axis=1),
203
+ positions_no_heading[..., None],
204
+ ).squeeze(-1)
205
+
206
+ # recover root translation
207
+ # add heading to velocities_root_xy_no_heading
208
+
209
+ velocities_root_xyz_no_heading = np.zeros(
210
+ (
211
+ velocities_root_xy_no_heading.shape[0],
212
+ 3,
213
+ )
214
+ )
215
+ velocities_root_xyz_no_heading[:, 0] = velocities_root_xy_no_heading[:, 0]
216
+ velocities_root_xyz_no_heading[:, 2] = velocities_root_xy_no_heading[:, 1]
217
+ velocities_root_xyz_no_heading[1:, :] = np.matmul(
218
+ inv_global_heading_rot[:-1], velocities_root_xyz_no_heading[1:, :, None]
219
+ ).squeeze(-1)
220
+
221
+ root_translation = np.cumsum(velocities_root_xyz_no_heading, axis=0)
222
+
223
+ # add root translation
224
+ positions_with_heading[:, :, 0] += root_translation[:, 0:1]
225
+ positions_with_heading[:, :, 2] += root_translation[:, 2:]
226
+
227
+ return positions_with_heading
228
+
229
+
230
+ def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
231
+ a1, a2 = d6[..., :3], d6[..., 3:]
232
+ b1 = F.normalize(a1, dim=-1)
233
+ b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
234
+ b2 = F.normalize(b2, dim=-1)
235
+ b3 = torch.cross(b1, b2, dim=-1)
236
+ return torch.stack((b1, b2, b3), dim=-2)
237
+
238
+
239
+ def _copysign(a, b):
240
+ signs_differ = (a < 0) != (b < 0)
241
+ return torch.where(signs_differ, -a, a)
242
+
243
+
244
+ def _sqrt_positive_part(x):
245
+ ret = torch.zeros_like(x)
246
+ positive_mask = x > 0
247
+ ret[positive_mask] = torch.sqrt(x[positive_mask])
248
+ return ret
249
+
250
+
251
+ def matrix_to_quaternion(matrix):
252
+ if matrix.size(-1) != 3 or matrix.size(-2) != 3:
253
+ raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
254
+ m00 = matrix[..., 0, 0]
255
+ m11 = matrix[..., 1, 1]
256
+ m22 = matrix[..., 2, 2]
257
+ o0 = 0.5 * _sqrt_positive_part(1 + m00 + m11 + m22)
258
+ x = 0.5 * _sqrt_positive_part(1 + m00 - m11 - m22)
259
+ y = 0.5 * _sqrt_positive_part(1 - m00 + m11 - m22)
260
+ z = 0.5 * _sqrt_positive_part(1 - m00 - m11 + m22)
261
+ o1 = _copysign(x, matrix[..., 2, 1] - matrix[..., 1, 2])
262
+ o2 = _copysign(y, matrix[..., 0, 2] - matrix[..., 2, 0])
263
+ o3 = _copysign(z, matrix[..., 1, 0] - matrix[..., 0, 1])
264
+ return torch.stack((o0, o1, o2, o3), -1)
265
+
266
+
+ def quaternion_to_axis_angle(quaternions):
+     norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
+     half_angles = torch.atan2(norms, quaternions[..., :1])
+     angles = 2 * half_angles
+     eps = 1e-6
+     small_angles = angles.abs() < eps
+     sin_half_angles_over_angles = torch.empty_like(angles)
+     sin_half_angles_over_angles[~small_angles] = (
+         torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+     )
+     # for x small, sin(x/2) is about x/2 - (x/2)^3/6
+     # so sin(x/2)/x is about 1/2 - (x*x)/48
+     sin_half_angles_over_angles[small_angles] = (
+         0.5 - (angles[small_angles] * angles[small_angles]) / 48
+     )
+     return quaternions[..., 1:] / sin_half_angles_over_angles
+
+
+ def matrix_to_axis_angle(matrix):
+     return quaternion_to_axis_angle(matrix_to_quaternion(matrix))
+
+
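As a quick, illustrative sanity check of the rotation helpers above (not part of the shipped file; it assumes the functions are already defined in the current namespace), a 90-degree rotation about the y axis, encoded as the first two rows of its rotation matrix, should round-trip to roughly the axis-angle vector [0, pi/2, 0]:

```python
import torch

# 6D encoding = first two rows of the target rotation matrix R_y(90 deg).
d6 = torch.tensor([[0.0, 0.0, 1.0, 0.0, 1.0, 0.0]])
R = rotation_6d_to_matrix(d6)   # shape (1, 3, 3)
aa = matrix_to_axis_angle(R)    # shape (1, 3), approximately [0, 1.5708, 0]
print(aa)
```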
+ def rotations_matrix_to_smpl85(rotations_matrix, translation):
+     nfrm, njoint, _, _ = rotations_matrix.shape
+     axis_angle = (
+         matrix_to_axis_angle(torch.from_numpy(rotations_matrix))
+         .numpy()
+         .reshape(nfrm, -1)
+     )
+     # For 22 body joints: 66 axis-angle values + 6 zeros (padding to SMPL's
+     # 24 joints) + 3 translation values + 10 zero shape betas = 85 dims.
+     smpl_85 = np.concatenate(
+         [axis_angle, np.zeros((nfrm, 6)), translation, np.zeros((nfrm, 10))], axis=-1
+     )
+     return smpl_85
+
+
+ def recover_from_local_rotation(final_x, njoint):
+     nfrm, _ = final_x.shape
+     rotations_matrix = rotation_6d_to_matrix(
+         torch.from_numpy(final_x[:, 8 + 6 * njoint : 8 + 12 * njoint]).reshape(
+             nfrm, -1, 6
+         )
+     ).numpy()
+     global_heading_diff_rot = final_x[:, 2:8]
+     velocities_root_xy_no_heading = final_x[:, :2]
+     positions_no_heading = final_x[:, 8 : 8 + 3 * njoint].reshape(nfrm, -1, 3)
+     height = positions_no_heading[:, 0, 1]
+
+     global_heading_rot = accumulate_rotations(
+         rotation_6d_to_matrix(torch.from_numpy(global_heading_diff_rot)).numpy()
+     )
+     inv_global_heading_rot = np.transpose(global_heading_rot, (0, 2, 1))
+     # recover root rotation
+     rotations_matrix[:, 0, ...] = np.matmul(
+         inv_global_heading_rot, rotations_matrix[:, 0, ...]
+     )
+     velocities_root_xyz_no_heading = np.zeros(
+         (
+             velocities_root_xy_no_heading.shape[0],
+             3,
+         )
+     )
+     velocities_root_xyz_no_heading[:, 0] = velocities_root_xy_no_heading[:, 0]
+     velocities_root_xyz_no_heading[:, 2] = velocities_root_xy_no_heading[:, 1]
+     velocities_root_xyz_no_heading[1:, :] = np.matmul(
+         inv_global_heading_rot[:-1], velocities_root_xyz_no_heading[1:, :, None]
+     ).squeeze(-1)
+     root_translation = np.cumsum(velocities_root_xyz_no_heading, axis=0)
+     root_translation[:, 1] = height
+     smpl_85 = rotations_matrix_to_smpl85(rotations_matrix, root_translation)
+     return smpl_85
+
+
+ def recover_joint_positions_272(data: np.ndarray, joints_num) -> np.ndarray:
+     return recover_from_local_position(data, joints_num)
+
+
+ def convert_motion_to_joints(
+     motion_data: np.ndarray,
+     dim: int,
+     mean: np.ndarray = None,
+     std: np.ndarray = None,
+     joints_num=22,
+ ):
+     """
+     Convert (K, 263)- or (K, 272)-dimensional motion data to (K, 22, 3) joint positions.
+     Args:
+         motion_data: numpy array of shape (K, 263) or (K, 272), where K is the number of frames
+         dim: feature dimension of the representation, either 263 or 272
+         mean, std: optional normalization statistics; if both are given, the data is de-normalized first
+         joints_num: number of skeleton joints to recover (22 by default)
+     Returns:
+         joints: numpy array of shape (K, 22, 3) representing joint positions
+     """
+     if mean is not None and std is not None:
+         motion_data = motion_data * std + mean
+     if dim == 263:
+         recovered_positions = recover_joint_positions_263(motion_data, joints_num)
+     elif dim == 272:
+         recovered_positions = recover_joint_positions_272(motion_data, joints_num)
+     else:
+         raise ValueError(f"Unsupported motion data dimension: {dim}")
+     return recovered_positions
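For reference, a minimal usage sketch of `convert_motion_to_joints`; the file names and the source of the generated motion are illustrative placeholders, not files shipped in this repository:

```python
import numpy as np

# Denormalize a generated sequence and recover 3D joint positions
# from the 272-dim representation (all paths are placeholders).
motion = np.load("generated_motion.npy")   # (K, 272), normalized
mean = np.load("mean.npy")                 # (272,)
std = np.load("std.npy")                   # (272,)

joints = convert_motion_to_joints(motion, dim=272, mean=mean, std=std, joints_num=22)
print(joints.shape)                        # (K, 22, 3)
```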
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3528a345e2795f0b28343896515adc2c14746567896c66620852678ff8d43a79
+ size 36753080
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ # Core dependencies
+ torch>=2.0.0
+ transformers>=4.30.0
+ huggingface_hub>=0.16.0
+ safetensors>=0.3.0
+ diffusers>=0.20.0
+
+ # Inference
+ lightning>=2.0.0
+ ftfy
+
+ # Configuration
+ omegaconf
+
+ # Utilities
+ numpy
+
+ # Note: flash-attn is required but needs special installation
+ # See README.md for installation instructions
vae.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a40164154c476309ff952a4b7563750b7e76fbdd8d263ec261ad877cf452e7b
+ size 70027220