Commit ·
6b92ff7
0
Parent(s):
Initial commit: AniGen - Animatable 3D Generation
Browse files — This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
- .gitattributes +58 -0
- Dockerfile +63 -0
- README.md +231 -0
- THIRD_PARTY_LICENSES.md +30 -0
- anigen/__init__.py +6 -0
- anigen/datasets/__init__.py +32 -0
- anigen/datasets/anigen_sparse_feat2skeleton.py +290 -0
- anigen/datasets/anigen_sparse_structure.py +124 -0
- anigen/datasets/anigen_sparse_structure_latent.py +238 -0
- anigen/datasets/anigen_structured_latent.py +327 -0
- anigen/datasets/components.py +143 -0
- anigen/models/__init__.py +67 -0
- anigen/models/anigen_sparse_structure_flow.py +487 -0
- anigen/models/anigen_sparse_structure_vae.py +729 -0
- anigen/models/anigen_structured_latent_flow.py +553 -0
- anigen/models/sparse_elastic_mixin.py +24 -0
- anigen/models/structured_latent_vae/__init__.py +3 -0
- anigen/models/structured_latent_vae/anigen_base.py +256 -0
- anigen/models/structured_latent_vae/anigen_decoder.py +834 -0
- anigen/models/structured_latent_vae/anigen_encoder.py +318 -0
- anigen/models/structured_latent_vae/base.py +117 -0
- anigen/models/structured_latent_vae/skin_models.py +252 -0
- anigen/modules/attention/__init__.py +36 -0
- anigen/modules/attention/full_attn.py +140 -0
- anigen/modules/attention/modules.py +161 -0
- anigen/modules/norm.py +25 -0
- anigen/modules/sparse/__init__.py +102 -0
- anigen/modules/sparse/attention/__init__.py +5 -0
- anigen/modules/sparse/attention/full_attn.py +215 -0
- anigen/modules/sparse/attention/modules.py +151 -0
- anigen/modules/sparse/attention/serialized_attn.py +193 -0
- anigen/modules/sparse/attention/windowed_attn.py +135 -0
- anigen/modules/sparse/attention/windowed_attn_cross.py +131 -0
- anigen/modules/sparse/basic.py +465 -0
- anigen/modules/sparse/conv/__init__.py +21 -0
- anigen/modules/sparse/conv/conv_spconv.py +80 -0
- anigen/modules/sparse/conv/conv_torchsparse.py +38 -0
- anigen/modules/sparse/linear.py +15 -0
- anigen/modules/sparse/nonlinearity.py +35 -0
- anigen/modules/sparse/norm.py +58 -0
- anigen/modules/sparse/spatial.py +110 -0
- anigen/modules/sparse/transformer/__init__.py +3 -0
- anigen/modules/sparse/transformer/anigen_modulated.py +155 -0
- anigen/modules/sparse/transformer/blocks.py +259 -0
- anigen/modules/sparse/transformer/modulated.py +174 -0
- anigen/modules/spatial.py +48 -0
- anigen/modules/transformer/__init__.py +2 -0
- anigen/modules/transformer/blocks.py +285 -0
- anigen/modules/transformer/modulated.py +175 -0
- anigen/modules/utils.py +54 -0
.gitattributes
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
anigen/representations/mesh/flexicubes/images/block_init.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
anigen/representations/mesh/flexicubes/images/teaser_top.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/cond_images/dog.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/cond_images/lamp.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
assets/cond_images/machine_arm.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
assets/cond_images/owl.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
assets/cond_images/trex.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
assets/cond_images/whale.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
assets/images/teaser.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
assets/cond_images/machine_dog.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
assets/cond_images/spongebob.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
assets/gifs/eagle.gif filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
assets/gifs/evo.gif filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
assets/gifs/horse.gif filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
assets/gifs/iron_boy.gif filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
assets/gifs/machine_arm.gif filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
assets/gifs/machine_dog.gif filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
assets/gifs/mairo.gif filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
assets/gifs/money_tree.gif filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
assets/cond_images/brickbob.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
assets/cond_images/bruno_star.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
assets/cond_images/evo.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
assets/cond_images/iron_boy.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
|
| 2 |
+
|
| 3 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
python3.10 python3-pip python3.10-dev \
|
| 7 |
+
git git-lfs ffmpeg libsm6 libxext6 libgl1 libegl1 \
|
| 8 |
+
build-essential ninja-build cmake rsync \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/* \
|
| 10 |
+
&& git lfs install
|
| 11 |
+
|
| 12 |
+
RUN useradd -m -u 1000 user
|
| 13 |
+
USER user
|
| 14 |
+
|
| 15 |
+
ENV HOME=/home/user \
|
| 16 |
+
PATH=/home/user/.local/bin:$PATH \
|
| 17 |
+
PYTHONUNBUFFERED=1 \
|
| 18 |
+
PIP_NO_CACHE_DIR=1 \
|
| 19 |
+
HF_HOME=/home/user/.cache/huggingface \
|
| 20 |
+
TORCH_HOME=/home/user/.cache/torch \
|
| 21 |
+
ATTN_BACKEND=xformers \
|
| 22 |
+
SPARSE_ATTN_BACKEND=xformers \
|
| 23 |
+
TORCH_CUDA_ARCH_LIST="7.5;8.6;8.9"
|
| 24 |
+
|
| 25 |
+
WORKDIR $HOME/app
|
| 26 |
+
|
| 27 |
+
COPY --chown=user:user . $HOME/app
|
| 28 |
+
|
| 29 |
+
RUN python3.10 -m pip install --upgrade pip setuptools wheel
|
| 30 |
+
|
| 31 |
+
RUN python3.10 -m pip install \
|
| 32 |
+
torch==2.4.0 torchvision==0.19.0 \
|
| 33 |
+
--index-url https://download.pytorch.org/whl/cu121
|
| 34 |
+
|
| 35 |
+
RUN python3.10 -m pip install \
|
| 36 |
+
pillow imageio imageio-ffmpeg tqdm easydict scipy ninja psutil safetensors \
|
| 37 |
+
scikit-learn opencv-python-headless rembg onnxruntime \
|
| 38 |
+
trimesh xatlas pyvista pymeshfix igraph pygltflib geffnet \
|
| 39 |
+
transformers \
|
| 40 |
+
gradio==4.44.1 gradio_litmodel3d==0.0.1 "huggingface_hub<0.25"
|
| 41 |
+
|
| 42 |
+
RUN python3.10 -m pip install \
|
| 43 |
+
git+https://github.com/EasternJournalist/utils3d.git@9a4eb15e4021b67b12c460c7057d642626897ec8
|
| 44 |
+
|
| 45 |
+
RUN python3.10 -m pip install \
|
| 46 |
+
"pytorch3d==0.7.8" \
|
| 47 |
+
--find-links https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt240/download.html
|
| 48 |
+
|
| 49 |
+
RUN python3.10 -m pip install \
|
| 50 |
+
xformers==0.0.27.post2 --index-url https://download.pytorch.org/whl/cu121
|
| 51 |
+
|
| 52 |
+
RUN python3.10 -m pip install spconv-cu121
|
| 53 |
+
|
| 54 |
+
RUN python3.10 -m pip install \
|
| 55 |
+
kaolin -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.4.0_cu121.html
|
| 56 |
+
|
| 57 |
+
RUN python3.10 -m pip install \
|
| 58 |
+
"git+https://github.com/NVlabs/nvdiffrast.git" --no-build-isolation
|
| 59 |
+
|
| 60 |
+
EXPOSE 7860
|
| 61 |
+
|
| 62 |
+
CMD ["python3.10", "app.py"]
|
| 63 |
+
|
README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: AniGen
|
| 3 |
+
sdk: gradio
|
| 4 |
+
sdk_version: 4.44.1
|
| 5 |
+
python_version: 3.10.13
|
| 6 |
+
startup_duration_timeout: 2h
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
<h1 align="center">AniGen: Unified S<sup>3</sup> Fields for Animatable 3D Asset Generation</h1>
|
| 10 |
+
<p align="center"><a href="https://arxiv.org/pdf/2604.08746"><img src='https://img.shields.io/badge/arXiv-Paper-red?logo=arxiv&logoColor=white' alt='arXiv'></a>
|
| 11 |
+
<a href='https://yihua7.github.io/AniGen_web/'><img src='https://img.shields.io/badge/Project_Page-Website-green?logo=googlechrome&logoColor=white' alt='Project Page'></a>
|
| 12 |
+
<a href='https://huggingface.co/spaces/VAST-AI/AniGen'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Live_Demo-blue'></a>
|
| 13 |
+
<a href="https://www.tripo3d.ai"><img src="https://img.shields.io/badge/Tripo-AI_3D_Workspace-orange" alt="Tripo"></a>
|
| 14 |
+
</p>
|
| 15 |
+
<p align="center"><img src="assets/images/teaser.png" width="100%"></p>
|
| 16 |
+
|
| 17 |
+
<span style="font-size: 16px; font-weight: 600;">A</span><span style="font-size: 12px; font-weight: 700;">niGen</span> is a unified framework that directly generates animate-ready 3D assets conditioned on a single image. Our key insight is to represent shape, skeleton, and skinning as mutually consistent *$S^3$ Fields* (Shape, Skeleton, Skin) defined over a shared spatial domain. To enable the robust learning of these fields, we introduce two technical innovations: (i) a *confidence-decaying skeleton field* that explicitly handles the geometric ambiguity of bone prediction at Voronoi boundaries, and (ii) a *dual skin feature field* that decouples skinning weights from specific joint counts, allowing a fixed-architecture network to predict rigs of arbitrary complexity. Built upon a two-stage flow-matching pipeline, <span style="font-size: 16px; font-weight: 600;">A</span><span style="font-size: 12px; font-weight: 700;">niGen</span> first synthesizes a sparse structural scaffold and then generates dense geometry and articulation in a structured latent space. Extensive experiments demonstrate that <span style="font-size: 16px; font-weight: 600;">A</span><span style="font-size: 12px; font-weight: 700;">niGen</span> substantially outperforms state-of-the-art sequential baselines in rig validity and animation quality, generalizing effectively to in-the-wild images across diverse categories including animals, humanoids, and machinery.
|
| 18 |
+
|
| 19 |
+
<!-- Overview -->
|
| 20 |
+
## 🔮 Overview
|
| 21 |
+
|
| 22 |
+
AniGen takes a **single image** as input and automatically produces a fully rigged, animate-ready 3D asset, complete with a coherent mesh, an articulated skeleton, and smooth skinning weights. The generated assets can be directly imported into standard 3D pipelines and driven by off-the-shelf motion data, enabling immediate deployment across a wide spectrum of downstream applications, including **embodied AI** agent construction, **physics-based simulation**, **character animation**, **dynamic scene creation**, and **articulated object manipulation**.
|
| 23 |
+
|
| 24 |
+
<table width="100%">
|
| 25 |
+
<tr>
|
| 26 |
+
<td width="25%" align="center"><img src="assets/gifs/machine_arm.gif" width="100%"><br><b>Machine Arm</b></td>
|
| 27 |
+
<td width="25%" align="center"><img src="assets/gifs/machine_dog.gif" width="100%"><br><b>Machine Dog</b></td>
|
| 28 |
+
<td width="25%" align="center"><img src="assets/gifs/money_tree.gif" width="100%"><br><b>Money Tree</b></td>
|
| 29 |
+
<td width="25%" align="center"><img src="assets/gifs/iron_boy.gif" width="100%"><br><b>Iron Boy</b></td>
|
| 30 |
+
</tr>
|
| 31 |
+
<tr>
|
| 32 |
+
<td width="25%" align="center"><img src="assets/gifs/mairo.gif" width="100%"><br><b>Mairo</b></td>
|
| 33 |
+
<td width="25%" align="center"><img src="assets/gifs/evo.gif" width="100%"><br><b>Evo</b></td>
|
| 34 |
+
<td width="25%" align="center"><img src="assets/gifs/horse.gif" width="100%"><br><b>Horse</b></td>
|
| 35 |
+
<td width="25%" align="center"><img src="assets/gifs/eagle.gif" width="100%"><br><b>Eagle</b></td>
|
| 36 |
+
</tr>
|
| 37 |
+
</table>
|
| 38 |
+
|
| 39 |
+
<!-- Installation -->
|
| 40 |
+
## 📦 Installation
|
| 41 |
+
|
| 42 |
+
### Prerequisites
|
| 43 |
+
- **System**: The code is currently tested only on **Linux**.
|
| 44 |
+
- **Hardware**: An NVIDIA GPU with at least 18GB of memory is necessary. The code has been verified on NVIDIA A800 and RTX3090 GPUs.
|
| 45 |
+
- **Software**:
|
| 46 |
+
- The [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive) is needed to compile certain submodules. The code has been tested with CUDA versions 11.8 and 12.2.
|
| 47 |
+
- [Conda](https://docs.anaconda.com/miniconda/install/#quick-command-line-install) is recommended for managing dependencies.
|
| 48 |
+
- Python version 3.8 or higher is required.
|
| 49 |
+
|
| 50 |
+
### Installation Steps
|
| 51 |
+
1. Clone the repo:
|
| 52 |
+
```sh
|
| 53 |
+
git clone --recurse-submodules https://github.com/VAST-AI-Research/AniGen.git
|
| 54 |
+
cd AniGen
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
2. Install the dependencies:
|
| 58 |
+
|
| 59 |
+
We recommend using [uv](https://docs.astral.sh/uv/) for fast, reliable installs. The setup script will also work with plain `pip` if `uv` is not available.
|
| 60 |
+
|
| 61 |
+
Create a new virtual environment and install everything:
|
| 62 |
+
```sh
|
| 63 |
+
source ./setup.sh --new-env --all
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
If your network connection to PyPI is unstable or slow, you can use the Tsinghua mirror:
|
| 67 |
+
```sh
|
| 68 |
+
source ./setup.sh --new-env --all --tsinghua
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
If you already have an environment with PyTorch installed, install into it directly:
|
| 72 |
+
```sh
|
| 73 |
+
source ./setup.sh --basic
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
> [!NOTE]
|
| 77 |
+
> The setup script auto-detects your CUDA version and installs matching wheels for PyTorch, spconv, pytorch3d, and nvdiffrast. [DSINE](https://github.com/baegwangbin/DSINE) (used for surface normal estimation) is loaded at runtime via `torch.hub` and does not require separate installation.
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
<!-- Pretrained Models -->
|
| 81 |
+
## 🤖 Pretrained Models
|
| 82 |
+
|
| 83 |
+
We provide the following pretrained models on [Hugging Face](https://huggingface.co/VAST-AI/AniGen/tree/main). Please make sure to download all necessary weights from this page, including the required dinov2, dsine, and vgg checkpoints.
|
| 84 |
+
|
| 85 |
+
> [!TIP]
|
| 86 |
+
> **Recommended:** Use **SS-Flow-Duet** + **SLAT-Flow-Auto** if you do not have specific requirements.
|
| 87 |
+
> - For more detailed skeleton (including character fingers) → **SS-Flow-Duet**
|
| 88 |
+
> - For better geometry generalization → **SS-Flow-Solo**
|
| 89 |
+
> - **SLAT-Flow-Control** supports density levels 0–4, but if the density condition significantly deviates from the proper value for the object, skinning weights may be damaged.
|
| 90 |
+
|
| 91 |
+
| DAE Model | Description | Download |
|
| 92 |
+
| --- | --- | --- |
|
| 93 |
+
| SS-DAE | Encoder&Decoder of SS | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/ss_dae) |
|
| 94 |
+
| SLAT-DAE | Encoder&Decoder of SLAT | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/slat_dae) |
|
| 95 |
+
|
| 96 |
+
| SS Model | Description | Download |
|
| 97 |
+
| --- | --- | --- |
|
| 98 |
+
| SS-Flow-Duet | Detailed Skeleton (Full-FT Geo) | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/ss_flow_duet) |
|
| 99 |
+
| SS-Flow-Epic | Geometry&Skeleton Balanced (LoRA-FT Geo) | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/ss_flow_epic) |
|
| 100 |
+
| SS-Flow-Solo | Accurate Geometry (Freeze Geo) | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/ss_flow_solo) |
|
| 101 |
+
|
| 102 |
+
| SLAT Model | Description | Download |
|
| 103 |
+
| --- | --- | --- |
|
| 104 |
+
| SLAT-Flow-Auto | Automatically Determine Joint Number | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/slat_flow_auto) |
|
| 105 |
+
| SLAT-Flow-Control | Controllable Joint Density | [Download](https://huggingface.co/VAST-AI/AniGen/tree/main/ckpts/anigen/slat_flow_control) |
|
| 106 |
+
|
| 107 |
+
<!-- Usage -->
|
| 108 |
+
## 💡 Usage
|
| 109 |
+
|
| 110 |
+
### Minimal Example
|
| 111 |
+
|
| 112 |
+
Here is an [example](example.py) of how to use the pretrained models for 3D asset generation.
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
After running the code, you will get the following files:
|
| 116 |
+
- `mesh.glb`: a rigged mesh file
|
| 117 |
+
- `skeleton.glb`: a skeleton visualization file
|
| 118 |
+
- `processed_image.png`: the masked image as the condition
|
| 119 |
+
|
| 120 |
+
### AniGen Pipeline (Rigged Mesh + Skeleton)
|
| 121 |
+
|
| 122 |
+
For AniGen checkpoints in this repo (e.g. `ckpts/anigen/ss_flow_solo` + `ckpts/anigen/slat_flow_control`), you can run:
|
| 123 |
+
```sh
|
| 124 |
+
python example.py --image_path assets/cond_images/trex.png
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
### Web Demo
|
| 128 |
+
|
| 129 |
+
[app.py](app.py) provides a simple web demo for 3D asset generation. Since this demo is based on [Gradio](https://gradio.app/), additional dependencies are required:
|
| 130 |
+
```sh
|
| 131 |
+
source ./setup.sh --demo
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
If needed, you can also install the demo dependencies via the Tsinghua mirror:
|
| 135 |
+
```sh
|
| 136 |
+
source ./setup.sh --demo --tsinghua
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
After installing the dependencies, you can run the demo with the following command:
|
| 140 |
+
```sh
|
| 141 |
+
python app.py
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
Then, you can access the demo at the address shown in the terminal.
|
| 145 |
+
|
| 146 |
+
***The web demo is also available on [Hugging Face Spaces](https://huggingface.co/spaces/VAST-AI/AniGen)!***
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
<!-- Training -->
|
| 150 |
+
## 🏋️ Training
|
| 151 |
+
|
| 152 |
+
### Training Data
|
| 153 |
+
|
| 154 |
+
Sample training data is available at [AniGen_sample_data](https://huggingface.co/datasets/VAST-AI/AniGen_sample_data). To prepare your own data, refer to [TRELLIS](https://github.com/microsoft/TRELLIS) and the sample data format.
|
| 155 |
+
|
| 156 |
+
### Prerequisites
|
| 157 |
+
|
| 158 |
+
> [!NOTE]
|
| 159 |
+
> Training requires the **CUBVH** extension (`extensions/CUBVH/`), which is automatically built by `setup.sh`. It is **not** needed for inference (`app.py`, `example.py`).
|
| 160 |
+
|
| 161 |
+
### Training Commands
|
| 162 |
+
|
| 163 |
+
The pipeline has five stages. Later stages depend on earlier ones, so please train in order:
|
| 164 |
+
|
| 165 |
+
```sh
|
| 166 |
+
# Stage 1: Skin AutoEncoder
|
| 167 |
+
python train.py --config configs/anigen_skin_ae.json --output_dir outputs/anigen_skin_ae
|
| 168 |
+
|
| 169 |
+
# Stage 2: Sparse Structure DAE
|
| 170 |
+
python train.py --config configs/ss_dae.json --output_dir outputs/ss_dae
|
| 171 |
+
|
| 172 |
+
# Stage 3: Structured Latent DAE
|
| 173 |
+
python train.py --config configs/slat_dae.json --output_dir outputs/slat_dae
|
| 174 |
+
|
| 175 |
+
# Stage 4: SS Flow Matching (image-conditioned generation)
|
| 176 |
+
python train.py --config configs/ss_flow_duet.json --output_dir outputs/ss_flow_duet
|
| 177 |
+
|
| 178 |
+
# Stage 5: SLAT Flow Matching (image-conditioned generation)
|
| 179 |
+
python train.py --config configs/slat_flow_auto.json --output_dir outputs/slat_flow_auto
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
### Multi-Node / Multi-GPU
|
| 183 |
+
|
| 184 |
+
Append the following flags for distributed training across multiple machines and GPUs:
|
| 185 |
+
|
| 186 |
+
```sh
|
| 187 |
+
python train.py --config configs/<config>.json --output_dir outputs/<output> \
|
| 188 |
+
--num_nodes XX --node_rank XX --master_addr XX --master_port XX
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
### Model Variants
|
| 192 |
+
|
| 193 |
+
Other SS Flow variants (`ss_flow_epic`, `ss_flow_solo`) and SLAT Flow variants (`slat_flow_control`, `slat_flow_gsn_auto`) are available under `ckpts/anigen/`. Their config files can be found at `ckpts/anigen/<variant>/config.json`.
|
| 194 |
+
|
| 195 |
+
### Resume / Restart
|
| 196 |
+
|
| 197 |
+
Training automatically resumes from the latest checkpoint in `--output_dir`. To start fresh, pass `--ckpt none`.
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
## License
|
| 201 |
+
|
| 202 |
+
This project's source code is released under the [MIT License](LICENSE).
|
| 203 |
+
|
| 204 |
+
> [!IMPORTANT]
|
| 205 |
+
> This repository includes third-party components with additional license restrictions. In particular, `extensions/CUBVH/` contains BVH code derived from NVIDIA's [instant-ngp](https://github.com/NVlabs/instant-ngp), which is licensed for **non-commercial / research use only**. See [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md) for details.
|
| 206 |
+
|
| 207 |
+
## Acknowledgements
|
| 208 |
+
|
| 209 |
+
- [TRELLIS](https://github.com/microsoft/TRELLIS) by Microsoft
|
| 210 |
+
- [cuBVH](https://github.com/ashawkey/cubvh) by Jiaxiang Tang
|
| 211 |
+
- [tiny-cuda-nn](https://github.com/NVlabs/tiny-cuda-nn) and [instant-ngp](https://github.com/NVlabs/instant-ngp) by Thomas Müller / NVIDIA
|
| 212 |
+
- [FlexiCubes](https://github.com/nv-tlabs/FlexiCubes) by NVIDIA
|
| 213 |
+
|
| 214 |
+
We sincerely appreciate the contributions of these excellent projects and their authors. We believe open source helps accelerate research, lower barriers to innovation, and make progress more accessible to the broader community.
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
<!-- Citation -->
|
| 218 |
+
## 📜 Citation
|
| 219 |
+
|
| 220 |
+
If you find this work helpful, please consider citing our paper:
|
| 221 |
+
|
| 222 |
+
```bibtex
|
| 223 |
+
@article{huang2026anigen,
|
| 224 |
+
title = {AniGen: Unified $S^3$ Fields for Animatable 3D Asset Generation},
|
| 225 |
+
author = {Huang, Yi-Hua and Zhou, Zi-Xin and He, Yuting and Chang, Chirui
|
| 226 |
+
and Pu, Cheng-Feng and Yang, Ziyi and Guo, Yuan-Chen
|
| 227 |
+
and Cao, Yan-Pei and Qi, Xiaojuan},
|
| 228 |
+
journal = {ACM SIGGRAPH},
|
| 229 |
+
year = {2026}
|
| 230 |
+
}
|
| 231 |
+
```
|
THIRD_PARTY_LICENSES.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Third-Party Licenses
|
| 2 |
+
|
| 3 |
+
## extensions/CUBVH/ — cuBVH (CUDA Mesh BVH Acceleration)
|
| 4 |
+
|
| 5 |
+
Originally created by [Jiaxiang Tang (ashawkey)](https://github.com/ashawkey/cubvh),
|
| 6 |
+
modified by Yi-Hua Huang (yihua7).
|
| 7 |
+
|
| 8 |
+
### MIT License (cubvh overall)
|
| 9 |
+
- File: `extensions/CUBVH/LICENSE`
|
| 10 |
+
- Copyright (c) 2022 Jiaxiang Tang (ashawkey)
|
| 11 |
+
- Copyright (c) 2025 Yi-Hua Huang (yihua7)
|
| 12 |
+
|
| 13 |
+
### NVIDIA Source Code License — Non-Commercial (BVH from instant-ngp)
|
| 14 |
+
- File: `extensions/CUBVH/LICENSE_NVIDIA`
|
| 15 |
+
- Copyright (c) 2022, NVIDIA Corporation & affiliates
|
| 16 |
+
- **USE RESTRICTED TO NON-COMMERCIAL / RESEARCH PURPOSES ONLY**
|
| 17 |
+
|
| 18 |
+
### BSD 3-Clause License (gpu_memory.h from tiny-cuda-nn)
|
| 19 |
+
- File header: `extensions/CUBVH/include/gpu/gpu_memory.h`
|
| 20 |
+
- Copyright (c) 2020-2022, NVIDIA CORPORATION
|
| 21 |
+
|
| 22 |
+
### Apache License 2.0 (pcg32.h)
|
| 23 |
+
- File header: `extensions/CUBVH/include/gpu/pcg32.h`
|
| 24 |
+
- Author: Wenzel Jakob, modified by tiny-cuda-nn
|
| 25 |
+
|
| 26 |
+
## anigen/representations/mesh/flexicubes/ — FlexiCubes
|
| 27 |
+
|
| 28 |
+
- File: `anigen/representations/mesh/flexicubes/LICENSE.txt`
|
| 29 |
+
- Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES
|
| 30 |
+
- Apache License 2.0
|
anigen/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from . import models
|
| 2 |
+
from . import modules
|
| 3 |
+
from . import pipelines
|
| 4 |
+
from . import renderers
|
| 5 |
+
from . import representations
|
| 6 |
+
from . import utils
|
anigen/datasets/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
|
| 3 |
+
__attributes = {
|
| 4 |
+
'AniGenSparseStructure': 'anigen_sparse_structure',
|
| 5 |
+
'AniGenSparseFeat2Skeleton': 'anigen_sparse_feat2skeleton',
|
| 6 |
+
'AniGenSparseFeat2Render': 'anigen_sparse_feat2render',
|
| 7 |
+
|
| 8 |
+
'AniGenSparseStructureLatent': 'anigen_sparse_structure_latent',
|
| 9 |
+
'TextConditionedAniGenSparseStructureLatent': 'anigen_sparse_structure_latent',
|
| 10 |
+
'ImageConditionedAniGenSparseStructureLatent': 'anigen_sparse_structure_latent',
|
| 11 |
+
|
| 12 |
+
'AniGenSLat': 'anigen_structured_latent',
|
| 13 |
+
'AniGenTextConditionedSLat': 'anigen_structured_latent',
|
| 14 |
+
'AniGenImageConditionedSLat': 'anigen_structured_latent',
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
__submodules = []
|
| 18 |
+
|
| 19 |
+
__all__ = list(__attributes.keys()) + __submodules
|
| 20 |
+
|
| 21 |
+
def __getattr__(name):
|
| 22 |
+
if name not in globals():
|
| 23 |
+
if name in __attributes:
|
| 24 |
+
module_name = __attributes[name]
|
| 25 |
+
module = importlib.import_module(f".{module_name}", __name__)
|
| 26 |
+
globals()[name] = getattr(module, name)
|
| 27 |
+
elif name in __submodules:
|
| 28 |
+
module = importlib.import_module(f".{name}", __name__)
|
| 29 |
+
globals()[name] = module
|
| 30 |
+
else:
|
| 31 |
+
raise AttributeError(f"module {__name__} has no attribute {name}")
|
| 32 |
+
return globals()[name]
|
anigen/datasets/anigen_sparse_feat2skeleton.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import torch
|
| 7 |
+
import utils3d.torch
|
| 8 |
+
from ..modules.sparse.basic import SparseTensor
|
| 9 |
+
from .components import StandardDatasetBase
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AniGenSparseFeat2Skeleton(StandardDatasetBase):
|
| 13 |
+
"""
|
| 14 |
+
SparseFeat2Skeleton dataset.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
roots (str): paths to the dataset
|
| 18 |
+
image_size (int): size of the image
|
| 19 |
+
model (str): model name
|
| 20 |
+
resolution (int): resolution of the data
|
| 21 |
+
min_aesthetic_score (float): minimum aesthetic score
|
| 22 |
+
max_num_voxels (int): maximum number of voxels
|
| 23 |
+
"""
|
| 24 |
+
def __init__(
|
| 25 |
+
self,
|
| 26 |
+
roots: str,
|
| 27 |
+
image_size: int,
|
| 28 |
+
model: str = 'dinov2_vitl14_reg',
|
| 29 |
+
resolution: int = 64,
|
| 30 |
+
min_aesthetic_score: float = 5.0,
|
| 31 |
+
max_num_voxels: int = 32768,
|
| 32 |
+
load_cubvh: bool = False,
|
| 33 |
+
skl_dilation_iter: int = 0,
|
| 34 |
+
skl_dilation_random_aug: bool = False,
|
| 35 |
+
skl_dilation_random_aug_prob: float = 0.5,
|
| 36 |
+
filter_bad_skin: bool = False,
|
| 37 |
+
|
| 38 |
+
test_mode: bool = True, # Test the model performance
|
| 39 |
+
is_test: bool = False, # Train or validation
|
| 40 |
+
skin_accum_as_flow: bool = False, # Accumulate skin weights from bottom to top as flow-by probability
|
| 41 |
+
local_rank: int = 0,
|
| 42 |
+
joint_merge_res: int = 64,
|
| 43 |
+
**kwargs,
|
| 44 |
+
):
|
| 45 |
+
self.image_size = image_size
|
| 46 |
+
self.model = model
|
| 47 |
+
self.resolution = resolution
|
| 48 |
+
self.min_aesthetic_score = min_aesthetic_score
|
| 49 |
+
self.max_num_voxels = max_num_voxels
|
| 50 |
+
self.value_range = (0, 1)
|
| 51 |
+
self.load_cubvh = load_cubvh
|
| 52 |
+
self.skl_dilation_iter = skl_dilation_iter
|
| 53 |
+
self.skl_dilation_random_aug = skl_dilation_random_aug
|
| 54 |
+
self.skl_dilation_random_aug_prob = skl_dilation_random_aug_prob
|
| 55 |
+
self.filter_bad_skin = filter_bad_skin
|
| 56 |
+
|
| 57 |
+
self.test_mode = test_mode
|
| 58 |
+
self.is_test = is_test
|
| 59 |
+
self.skin_accum_as_flow = skin_accum_as_flow
|
| 60 |
+
self.local_rank = local_rank
|
| 61 |
+
self.joint_merge_res = joint_merge_res
|
| 62 |
+
|
| 63 |
+
super().__init__(roots, **kwargs)
|
| 64 |
+
self.is_bad_skin_list = self.metadata['is_bad_skin'].values
|
| 65 |
+
|
| 66 |
+
def filter_metadata(self, metadata):
|
| 67 |
+
stats = {}
|
| 68 |
+
metadata = metadata[metadata[f'feature_{self.model}']]
|
| 69 |
+
stats['With features'] = len(metadata)
|
| 70 |
+
metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
|
| 71 |
+
stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
|
| 72 |
+
metadata = metadata[metadata['num_voxels'] <= self.max_num_voxels]
|
| 73 |
+
stats[f'Num voxels <= {self.max_num_voxels}'] = len(metadata)
|
| 74 |
+
|
| 75 |
+
if 'is_bad_skeleton' in metadata.columns:
|
| 76 |
+
metadata = metadata[~metadata['is_bad_skeleton']]
|
| 77 |
+
if self.filter_bad_skin and 'is_bad_skin' in metadata.columns:
|
| 78 |
+
metadata = metadata[~metadata['is_bad_skin']]
|
| 79 |
+
|
| 80 |
+
if self.test_mode:
|
| 81 |
+
metadata = metadata[metadata['is_test']] if self.is_test else metadata[~metadata['is_test']]
|
| 82 |
+
|
| 83 |
+
return metadata, stats
|
| 84 |
+
|
| 85 |
+
def _get_image(self, root, instance):
|
| 86 |
+
with open(os.path.join(root, 'renders', instance, 'transforms.json')) as f:
|
| 87 |
+
metadata = json.load(f)
|
| 88 |
+
n_views = len(metadata['frames'])
|
| 89 |
+
view = np.random.randint(n_views)
|
| 90 |
+
metadata = metadata['frames'][view]
|
| 91 |
+
fov = metadata['camera_angle_x']
|
| 92 |
+
intrinsics = utils3d.torch.intrinsics_from_fov_xy(torch.tensor(fov), torch.tensor(fov))
|
| 93 |
+
c2w = torch.tensor(metadata['transform_matrix'])
|
| 94 |
+
c2w[:3, 1:3] *= -1
|
| 95 |
+
extrinsics = torch.inverse(c2w)
|
| 96 |
+
|
| 97 |
+
image_path = os.path.join(root, 'renders', instance, metadata['file_path'])
|
| 98 |
+
image = Image.open(image_path)
|
| 99 |
+
alpha = image.getchannel(3)
|
| 100 |
+
image = image.convert('RGB')
|
| 101 |
+
image = image.resize((self.image_size, self.image_size), Image.Resampling.LANCZOS)
|
| 102 |
+
alpha = alpha.resize((self.image_size, self.image_size), Image.Resampling.LANCZOS)
|
| 103 |
+
image = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0
|
| 104 |
+
alpha = torch.tensor(np.array(alpha)).float() / 255.0
|
| 105 |
+
|
| 106 |
+
return {
|
| 107 |
+
'image': image,
|
| 108 |
+
'alpha': alpha,
|
| 109 |
+
'extrinsics': extrinsics,
|
| 110 |
+
'intrinsics': intrinsics,
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
def _get_feat(self, root, instance):
|
| 114 |
+
DATA_RESOLUTION = 64
|
| 115 |
+
feats_path = os.path.join(root, 'features', self.model, f'{instance}.npz')
|
| 116 |
+
feats_data = np.load(feats_path, allow_pickle=True)
|
| 117 |
+
coords = torch.tensor(feats_data['indices']).int()
|
| 118 |
+
feats = torch.tensor(feats_data['patchtokens']).float()
|
| 119 |
+
|
| 120 |
+
position = utils3d.io.read_ply(os.path.join(root, 'voxels', f'{instance}_skeleton.ply'))[0]
|
| 121 |
+
coords_skl = ((torch.tensor(position) + 0.5) * self.resolution).int().contiguous()
|
| 122 |
+
ss_skl = torch.zeros(1, self.resolution, self.resolution, self.resolution, dtype=torch.long)
|
| 123 |
+
ss_skl[0, coords_skl[:,0], coords_skl[:,1], coords_skl[:,2]] = 1
|
| 124 |
+
ss_skl_ori = ss_skl.clone()
|
| 125 |
+
if self.skl_dilation_random_aug or self.skl_dilation_iter > 0:
|
| 126 |
+
size = max(0, self.skl_dilation_iter) * 2 + 1
|
| 127 |
+
if self.skl_dilation_iter > 0:
|
| 128 |
+
kernel = torch.ones(1, 1, size, size, size, dtype=torch.float32, device=ss_skl.device)
|
| 129 |
+
ss_skl = torch.nn.functional.conv3d(ss_skl.float()[None], kernel, padding=self.skl_dilation_iter)
|
| 130 |
+
ss_skl = (ss_skl > 0).long().squeeze(0)
|
| 131 |
+
coords_skl = torch.nonzero(ss_skl[0], as_tuple=False).int()
|
| 132 |
+
if self.skl_dilation_random_aug and np.random.rand() < self.skl_dilation_random_aug_prob:
|
| 133 |
+
size_small, size_large = size - 2, size + 2
|
| 134 |
+
kernel_large = torch.ones(1, 1, size_large, size_large, size_large, dtype=torch.float32, device=ss_skl_ori.device)
|
| 135 |
+
ss_skl_large = torch.nn.functional.conv3d(ss_skl_ori.float()[None], kernel_large, padding=size_large//2)
|
| 136 |
+
ss_skl_large = (ss_skl_large > 0).long().squeeze(0)
|
| 137 |
+
if size_small > 1:
|
| 138 |
+
kernel_small = torch.ones(1, 1, size_small, size_small, size_small, dtype=torch.float32, device=ss_skl_ori.device)
|
| 139 |
+
ss_skl_small = torch.nn.functional.conv3d(ss_skl_ori.float()[None], kernel_small, padding=size_small//2)
|
| 140 |
+
ss_skl_small = (ss_skl_small > 0).long().squeeze(0)
|
| 141 |
+
else:
|
| 142 |
+
ss_skl_small = torch.zeros_like(ss_skl)
|
| 143 |
+
|
| 144 |
+
ss_skl_random_mask = torch.rand_like(ss_skl.float()) < 0.5
|
| 145 |
+
ss_skl = ss_skl_small * ss_skl_random_mask.long() + ss_skl_large * (1 - ss_skl_random_mask.long())
|
| 146 |
+
coords_skl = torch.nonzero(ss_skl[0], as_tuple=False).int()
|
| 147 |
+
feats_skl = torch.zeros((coords_skl.shape[0], 0), dtype=torch.float32)
|
| 148 |
+
|
| 149 |
+
if self.resolution != DATA_RESOLUTION:
|
| 150 |
+
factor = DATA_RESOLUTION // self.resolution
|
| 151 |
+
coords = coords // factor
|
| 152 |
+
coords, idx = coords.unique(return_inverse=True, dim=0)
|
| 153 |
+
feats = torch.scatter_reduce(
|
| 154 |
+
torch.zeros(coords.shape[0], feats.shape[1], device=feats.device),
|
| 155 |
+
dim=0,
|
| 156 |
+
index=idx.unsqueeze(-1).expand(-1, feats.shape[1]),
|
| 157 |
+
src=feats,
|
| 158 |
+
reduce='mean'
|
| 159 |
+
)
|
| 160 |
+
coords_skl = coords_skl // factor
|
| 161 |
+
coords_skl, idx = coords_skl.unique(return_inverse=True, dim=0)
|
| 162 |
+
feats_skl = torch.scatter_reduce(
|
| 163 |
+
torch.zeros(coords_skl.shape[0], feats_skl.shape[1], device=feats_skl.device),
|
| 164 |
+
dim=0,
|
| 165 |
+
index=idx.unsqueeze(-1).expand(-1, feats_skl.shape[1]),
|
| 166 |
+
src=feats_skl,
|
| 167 |
+
reduce='mean'
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
return {
|
| 171 |
+
'coords': coords,
|
| 172 |
+
'feats': feats,
|
| 173 |
+
'coords_skl': coords_skl,
|
| 174 |
+
'feats_skl': feats_skl,
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
@torch.no_grad()
|
| 178 |
+
def visualize_sample(self, sample: dict):
|
| 179 |
+
return sample['image']
|
| 180 |
+
|
| 181 |
+
@staticmethod
|
| 182 |
+
def collate_fn(batch):
|
| 183 |
+
pack = {}
|
| 184 |
+
coords = []
|
| 185 |
+
coords_skl = []
|
| 186 |
+
for i, b in enumerate(batch):
|
| 187 |
+
coords.append(torch.cat([torch.full((b['coords'].shape[0], 1), i, dtype=torch.int32), b['coords']], dim=-1))
|
| 188 |
+
coords_skl.append(torch.cat([torch.full((b['coords_skl'].shape[0], 1), i, dtype=torch.int32), b['coords_skl']], dim=-1))
|
| 189 |
+
coords = torch.cat(coords)
|
| 190 |
+
feats = torch.cat([b['feats'] for b in batch])
|
| 191 |
+
pack['feats'] = SparseTensor(
|
| 192 |
+
coords=coords,
|
| 193 |
+
feats=feats,
|
| 194 |
+
)
|
| 195 |
+
coords_skl = torch.cat(coords_skl)
|
| 196 |
+
feats_skl = torch.cat([b['feats_skl'] for b in batch])
|
| 197 |
+
pack['feats_skl'] = SparseTensor(
|
| 198 |
+
coords=coords_skl,
|
| 199 |
+
feats=feats_skl,
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
pack['image'] = torch.stack([b['image'] for b in batch])
|
| 203 |
+
pack['alpha'] = torch.stack([b['alpha'] for b in batch])
|
| 204 |
+
pack['extrinsics'] = torch.stack([b['extrinsics'] for b in batch])
|
| 205 |
+
pack['intrinsics'] = torch.stack([b['intrinsics'] for b in batch])
|
| 206 |
+
|
| 207 |
+
pack['joints'] = [b['joints'] for b in batch]
|
| 208 |
+
pack['parents'] = [b['parents'] for b in batch]
|
| 209 |
+
pack['skin'] = [b['skin'] for b in batch]
|
| 210 |
+
pack['is_bad_skin'] = [b['is_bad_skin'] for b in batch]
|
| 211 |
+
|
| 212 |
+
# collate other data
|
| 213 |
+
keys = [k for k in batch[0].keys() if k not in ['coords', 'feats', 'coords_skl', 'feats_skl', 'image', 'alpha', 'extrinsics', 'intrinsics', 'joints', 'parents', 'skin']]
|
| 214 |
+
for k in keys:
|
| 215 |
+
if isinstance(batch[0][k], torch.Tensor):
|
| 216 |
+
pack[k] = torch.stack([b[k] for b in batch])
|
| 217 |
+
elif isinstance(batch[0][k], list):
|
| 218 |
+
pack[k] = sum([b[k] for b in batch], [])
|
| 219 |
+
else:
|
| 220 |
+
pack[k] = [b[k] for b in batch]
|
| 221 |
+
|
| 222 |
+
return pack
|
| 223 |
+
|
| 224 |
+
def _get_geo(self, root, instance):
|
| 225 |
+
skeleton_path = os.path.join(root, 'skeleton', instance, 'skeleton_voxelized.npz')
|
| 226 |
+
skl_data = np.load(skeleton_path, allow_pickle=True)
|
| 227 |
+
verts, face = np.array(skl_data['vertices'], dtype=np.float32), skl_data['faces']
|
| 228 |
+
mesh = {
|
| 229 |
+
"vertices" : torch.from_numpy(verts),
|
| 230 |
+
"faces" : torch.from_numpy(face),
|
| 231 |
+
}
|
| 232 |
+
geo = {"mesh": mesh}
|
| 233 |
+
if self.load_cubvh:
|
| 234 |
+
from cubvh import cuBVH
|
| 235 |
+
torch.cuda.set_device(self.local_rank)
|
| 236 |
+
cubvh_path = os.path.join(root, 'skeleton', instance, 'cubvh.pth')
|
| 237 |
+
if os.path.exists(cubvh_path):
|
| 238 |
+
bvh = torch.load(cubvh_path, weights_only=False)
|
| 239 |
+
if isinstance(bvh, cuBVH):
|
| 240 |
+
bvh = bvh.to('cpu')
|
| 241 |
+
else:
|
| 242 |
+
device = torch.device(f"cuda:{self.local_rank}")
|
| 243 |
+
bvh = cuBVH(mesh["vertices"], mesh["faces"], device=device)
|
| 244 |
+
bvh = bvh.to('cpu')
|
| 245 |
+
torch.save(bvh, cubvh_path)
|
| 246 |
+
geo["cubvh"] = bvh
|
| 247 |
+
return geo
|
| 248 |
+
|
| 249 |
+
def _get_skeleton(self, root, instance):
|
| 250 |
+
skeleton_path = os.path.join(root, 'skeleton', instance, 'skeleton_voxelized.npz')
|
| 251 |
+
skl_data = np.load(skeleton_path, allow_pickle=True)
|
| 252 |
+
joints, parents, skin = skl_data['joints'], skl_data['parents'], skl_data['skin']
|
| 253 |
+
parents[parents==None] = -1
|
| 254 |
+
parents = np.array(parents, dtype=np.int32)
|
| 255 |
+
|
| 256 |
+
skin[np.where(skl_data['skin'].max(axis=1)==0)[0], 0] = 1.0
|
| 257 |
+
skin = skin / skin.sum(-1, keepdims=True)
|
| 258 |
+
|
| 259 |
+
if self.skin_accum_as_flow:
|
| 260 |
+
root_idx = np.where(parents == -1)[0][0]
|
| 261 |
+
def sum_children(joint_idx, skin_weights):
|
| 262 |
+
children = np.where(parents == joint_idx)[0]
|
| 263 |
+
for child in children:
|
| 264 |
+
skin_weights[:, joint_idx] += sum_children(child, skin_weights)
|
| 265 |
+
return skin_weights[:, joint_idx]
|
| 266 |
+
sum_children(root_idx, skin)
|
| 267 |
+
skin = np.clip(skin, 0, 1)
|
| 268 |
+
|
| 269 |
+
is_bad_skin = self.metadata['is_bad_skin'][instance]
|
| 270 |
+
|
| 271 |
+
return {
|
| 272 |
+
'joints': torch.from_numpy(joints).float(),
|
| 273 |
+
'parents': torch.from_numpy(parents).int(),
|
| 274 |
+
'skin': torch.from_numpy(skin).float(),
|
| 275 |
+
'is_bad_skin': is_bad_skin
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
def get_instance(self, root, instance):
|
| 279 |
+
image = self._get_image(root, instance)
|
| 280 |
+
feat = self._get_feat(root, instance)
|
| 281 |
+
geo = self._get_geo(root, instance)
|
| 282 |
+
skl = self._get_skeleton(root, instance)
|
| 283 |
+
|
| 284 |
+
return {
|
| 285 |
+
**image,
|
| 286 |
+
**feat,
|
| 287 |
+
**geo,
|
| 288 |
+
**skl,
|
| 289 |
+
'instance': instance,
|
| 290 |
+
}
|
anigen/datasets/anigen_sparse_structure.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from typing import Union
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import torch
|
| 7 |
+
from torch.utils.data import Dataset
|
| 8 |
+
import utils3d
|
| 9 |
+
from .components import StandardDatasetBase
|
| 10 |
+
from ..representations.octree import DfsOctree as Octree
|
| 11 |
+
from ..renderers import OctreeRenderer
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AniGenSparseStructure(StandardDatasetBase):
    """
    Sparse structure dataset.

    Loads, per instance, two binary occupancy grids: the full shape voxels
    and the (optionally dilated) skeleton voxels.

    Args:
        roots (str): path to the dataset
        resolution (int): resolution of the voxel grid
        min_aesthetic_score (float): minimum aesthetic score of the instances
            to be included in the dataset
        skl_dilation_iter (int): number of 3D dilation iterations applied to
            the skeleton grid (kernel size = 2 * iter + 1)
    """

    def __init__(self,
        roots,
        resolution: int = 64,
        min_aesthetic_score: float = 5.0,
        skl_dilation_iter: int = 0,
        **kwargs,
    ):
        self.resolution = resolution
        self.min_aesthetic_score = min_aesthetic_score
        self.skl_dilation_iter = skl_dilation_iter
        self.value_range = (0, 1)

        super().__init__(roots)

    def filter_metadata(self, metadata):
        """Keep voxelized instances above the aesthetic-score threshold.

        Returns:
            (metadata, stats): filtered dataframe and per-filter counts.
        """
        stats = {}
        # Fixed: was metadata[f'voxelized'] — an f-string with no placeholder.
        metadata = metadata[metadata['voxelized']]
        stats['Voxelized'] = len(metadata)
        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
        return metadata, stats

    def get_ply_instance(self, root, instance, dilation_iter=None):
        """Voxelize a point cloud from a PLY file into a binary occupancy grid.

        Args:
            root: dataset root directory.
            instance: instance name (file stem under 'voxels/').
            dilation_iter: 3D dilation iterations; defaults to
                self.skl_dilation_iter when None.

        Returns:
            Long tensor of shape (1, R, R, R) with 0/1 occupancy.
        """
        if dilation_iter is None:
            dilation_iter = self.skl_dilation_iter

        position = utils3d.io.read_ply(os.path.join(root, 'voxels', f'{instance}.ply'))[0]
        # Positions are in [-0.5, 0.5]; shift to [0, 1] then scale to grid.
        coords = ((torch.tensor(position) + 0.5) * self.resolution).int().contiguous()
        ss = torch.zeros(1, self.resolution, self.resolution, self.resolution, dtype=torch.long)
        ss[:, coords[:, 0], coords[:, 1], coords[:, 2]] = 1
        if dilation_iter > 0:
            # 3D Dilation via a box-kernel convolution + thresholding.
            size = dilation_iter * 2 + 1
            kernel = torch.ones(1, 1, size, size, size, dtype=torch.float32, device=ss.device)
            ss = torch.nn.functional.conv3d(ss.float()[None], kernel, padding=dilation_iter)
            ss = (ss > 0).long().squeeze(0)
        return ss

    def get_instance(self, root, instance):
        """Load the shape grid (no dilation) and the skeleton grid (dilated)."""
        ss = self.get_ply_instance(root, instance, dilation_iter=0)
        ss_skl = self.get_ply_instance(root, f'{instance}_skeleton', dilation_iter=self.skl_dilation_iter)
        return {'ss': ss, 'ss_skl': ss_skl, 'instance': instance}

    @torch.no_grad()
    def visualize_sample(self, ss: Union[torch.Tensor, dict]):
        """Render a batch of occupancy grids from 4 viewpoints as 2x2 tiles.

        Args:
            ss: (B, 1, R, R, R) occupancy tensor, or a dict with key 'ss'.

        Returns:
            (B, 3, 1024, 1024) tensor of tiled renders on CUDA.
        """
        ss = ss if isinstance(ss, torch.Tensor) else ss['ss']

        renderer = OctreeRenderer()
        renderer.rendering_options.resolution = 512
        renderer.rendering_options.near = 0.8
        renderer.rendering_options.far = 1.6
        renderer.rendering_options.bg_color = (0, 0, 0)
        renderer.rendering_options.ssaa = 4
        renderer.pipe.primitive = 'voxel'

        # Build camera
        yaws = [0, np.pi / 2, np.pi, 3 * np.pi / 2]
        yaws_offset = 0.  # np.random.uniform(-np.pi / 4, np.pi / 4)
        yaws = [y + yaws_offset for y in yaws]
        pitches = np.linspace(-np.pi / 4, np.pi / 4, num=4)  # [np.random.uniform(-np.pi / 4, np.pi / 4) for _ in range(4)]

        exts = []
        ints = []
        # Note: loop variable renamed so it no longer shadows the pitch array.
        for yaw, cur_pitch in zip(yaws, pitches):
            orig = torch.tensor([
                np.sin(yaw) * np.cos(cur_pitch),
                np.cos(yaw) * np.cos(cur_pitch),
                np.sin(cur_pitch),
            ]).float().cuda() * 2
            fov = torch.deg2rad(torch.tensor(30)).cuda()
            extrinsics = utils3d.torch.extrinsics_look_at(orig, torch.tensor([0, 0, 0]).float().cuda(), torch.tensor([0, 0, 1]).float().cuda())
            intrinsics = utils3d.torch.intrinsics_from_fov_xy(fov, fov)
            exts.append(extrinsics)
            ints.append(intrinsics)

        images = []

        # Build each representation
        ss = ss.cuda()
        for i in range(ss.shape[0]):
            representation = Octree(
                depth=10,
                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
                device='cuda',
                primitive='voxel',
                sh_degree=0,
                primitive_config={'solid': True},
            )
            coords = torch.nonzero(ss[i, 0], as_tuple=False)
            representation.position = coords.float() / self.resolution
            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(self.resolution)), dtype=torch.uint8, device='cuda')

            image = torch.zeros(3, 1024, 1024).cuda()
            tile = [2, 2]
            for j, (ext, intr) in enumerate(zip(exts, ints)):
                res = renderer.render(representation, ext, intr, colors_overwrite=representation.position)
                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
            images.append(image)

        return torch.stack(images)
|
anigen/datasets/anigen_sparse_structure_latent.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from typing import *
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
import utils3d
|
| 7 |
+
from ..representations.octree import DfsOctree as Octree
|
| 8 |
+
from ..renderers import OctreeRenderer
|
| 9 |
+
from .components import StandardDatasetBase, TextConditionedMixin, ImageConditionedMixin
|
| 10 |
+
from .. import models
|
| 11 |
+
from ..utils.dist_utils import read_file_dist
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class AniGenSparseStructureLatentVisMixin:
    """Mixin that decodes sparse-structure latents with a pretrained decoder
    and renders the resulting shape/skeleton voxel grids for visualization.

    The decoder is loaded lazily on first use and released immediately after
    decoding to keep GPU memory free during training.
    """
    def __init__(
        self,
        *args,
        pretrained_ss_dec: str = None,
        ss_dec_path: Optional[str] = '',
        ss_dec_ckpt: Optional[str] = 'final',
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        # Decoder is created lazily in _loading_ss_dec and freed afterwards.
        self.ss_dec = None
        self.pretrained_ss_dec = pretrained_ss_dec
        self.ss_dec_path = ss_dec_path
        self.ss_dec_ckpt = ss_dec_ckpt

    def _loading_ss_dec(self):
        """Instantiate the sparse-structure decoder on CUDA (idempotent).

        A local ss_dec_path + checkpoint takes precedence over the
        pretrained_ss_dec model name.
        """
        if self.ss_dec is not None:
            return
        if self.ss_dec_path is not None:
            cfg = json.load(open(os.path.join(self.ss_dec_path, 'config.json'), 'r'))
            decoder = getattr(models, cfg['models']['decoder']['name'])(**cfg['models']['decoder']['args'])
            ckpt_path = os.path.join(self.ss_dec_path, 'ckpts', f'decoder_{self.ss_dec_ckpt}.pt')
            decoder.load_state_dict(torch.load(ckpt_path, map_location='cpu', weights_only=True))
            # decoder.load_state_dict(torch.load(read_file_dist(ckpt_path), map_location='cpu', weights_only=True)) # Got stuck...
        else:
            decoder = models.from_pretrained(self.pretrained_ss_dec)
        self.ss_dec = decoder.cuda().eval()

    def _delete_ss_dec(self):
        """Release the decoder so its GPU memory can be reclaimed."""
        del self.ss_dec
        self.ss_dec = None

    @torch.no_grad()
    def decode_latent(self, z, z_skl, batch_size=4):
        """Un-normalize and decode latents into (shape, skeleton) voxel grids.

        Args:
            z: shape latents, batched on dim 0.
            z_skl: skeleton latents, batched on dim 0 (same batch size as z).
            batch_size: decoder micro-batch size.

        Returns:
            (ss, ss_skl): decoded occupancy tensors concatenated over batches.
        """
        self._loading_ss_dec()
        ss = []
        ss_skl = []
        # Undo the dataset normalization before decoding.
        if self.normalization is not None:
            z = z * self.std.to(z.device) + self.mean.to(z.device)
        if self.normalization_skl is not None:
            z_skl = z_skl * self.std_skl.to(z_skl.device) + self.mean_skl.to(z_skl.device)
        for i in range(0, z.shape[0], batch_size):
            z_, z_skl_ = z[i:i+batch_size], z_skl[i:i+batch_size]
            ss_, ss_skl_ = self.ss_dec(z_, z_skl_)
            ss.append(ss_)
            ss_skl.append(ss_skl_)
        ss = torch.cat(ss, dim=0)
        ss_skl = torch.cat(ss_skl, dim=0)
        self._delete_ss_dec()
        return ss, ss_skl

    @torch.no_grad()
    def visualize_sample(self, x_0: Union[torch.Tensor, dict], x_0_skl: Optional[Union[torch.Tensor, dict]]=None, **kwargs):
        """Decode latents and render shape and skeleton side by side.

        Args:
            x_0: shape latent tensor, or a dict containing both 'x_0' and
                'x_0_skl' (in which case x_0_skl is ignored).
            x_0_skl: skeleton latent tensor when x_0 is a tensor.

        Returns:
            (B, 3, 1024, 2048) tensor: per sample, a 2x2 tile of shape views
            concatenated horizontally with a 2x2 tile of skeleton views.
        """
        # When x_0 is a dict it carries both latents; otherwise use x_0_skl.
        x_0_skl = x_0_skl if isinstance(x_0, torch.Tensor) else x_0['x_0_skl']
        x_0 = x_0 if isinstance(x_0, torch.Tensor) else x_0['x_0']
        x_0, x_0_skl = self.decode_latent(x_0.cuda(), x_0_skl.cuda())

        renderer = OctreeRenderer()
        renderer.rendering_options.resolution = 512
        renderer.rendering_options.near = 0.8
        renderer.rendering_options.far = 1.6
        renderer.rendering_options.bg_color = (0, 0, 0)
        renderer.rendering_options.ssaa = 4
        renderer.pipe.primitive = 'voxel'

        # Build camera
        yaws = [0, np.pi / 2, np.pi, 3 * np.pi / 2]
        yaws_offset = 0 # np.random.uniform(-np.pi / 4, np.pi / 4)
        yaws = [y + yaws_offset for y in yaws]
        pitch = np.linspace(-np.pi / 4, np.pi / 4, 4) # [np.random.uniform(-np.pi / 4, np.pi / 4) for _ in range(4)]

        exts = []
        ints = []
        # NOTE(review): the loop variable shadows the `pitch` array above;
        # harmless here since the array is not used after the loop.
        for yaw, pitch in zip(yaws, pitch):
            orig = torch.tensor([
                np.sin(yaw) * np.cos(pitch),
                np.cos(yaw) * np.cos(pitch),
                np.sin(pitch),
            ]).float().cuda() * 2
            fov = torch.deg2rad(torch.tensor(30)).cuda()
            extrinsics = utils3d.torch.extrinsics_look_at(orig, torch.tensor([0, 0, 0]).float().cuda(), torch.tensor([0, 0, 1]).float().cuda())
            intrinsics = utils3d.torch.intrinsics_from_fov_xy(fov, fov)
            exts.append(extrinsics)
            ints.append(intrinsics)

        images = []
        # Render the decoded shape grids into 2x2 tiles, one per sample.
        x_0 = x_0.cuda()
        for i in range(x_0.shape[0]):
            representation = Octree(
                depth=10,
                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
                device='cuda',
                primitive='voxel',
                sh_degree=0,
                primitive_config={'solid': True},
            )
            coords = torch.nonzero(x_0[i, 0] > 0, as_tuple=False)
            resolution = x_0.shape[-1]
            representation.position = coords.float() / resolution
            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(resolution)), dtype=torch.uint8, device='cuda')
            image = torch.zeros(3, 1024, 1024).cuda()
            tile = [2, 2]
            for j, (ext, intr) in enumerate(zip(exts, ints)):
                res = renderer.render(representation, ext, intr, colors_overwrite=representation.position)
                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
            images.append(image)

        # Render the skeleton grids and append each horizontally to its sample.
        x_0_skl = x_0_skl.cuda()
        for i in range(x_0_skl.shape[0]):
            representation = Octree(
                depth=10,
                aabb=[-0.5, -0.5, -0.5, 1, 1, 1],
                device='cuda',
                primitive='voxel',
                sh_degree=0,
                primitive_config={'solid': True},
            )
            coords = torch.nonzero(x_0_skl[i, 0] > 0, as_tuple=False)
            resolution = x_0_skl.shape[-1]
            representation.position = coords.float() / resolution
            representation.depth = torch.full((representation.position.shape[0], 1), int(np.log2(resolution)), dtype=torch.uint8, device='cuda')
            image = torch.zeros(3, 1024, 1024).cuda()
            tile = [2, 2]
            for j, (ext, intr) in enumerate(zip(exts, ints)):
                res = renderer.render(representation, ext, intr, colors_overwrite=representation.position)
                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
            images[i] = torch.cat([images[i], image], dim=2)

        return torch.stack(images)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class AniGenSparseStructureLatent(AniGenSparseStructureLatentVisMixin, StandardDatasetBase):
    """
    Sparse structure latent dataset

    Loads per-instance shape and skeleton latents (the 'mean' / 'mean_skl'
    entries of the saved npz) and optionally normalizes each with its own
    mean/std statistics file.

    Args:
        roots (str): path to the dataset
        latent_model (str): name of the latent model
        min_aesthetic_score (float): minimum aesthetic score
        normalization (dict): normalization stats
        normalization_skl (dict): normalization stats for the skeleton latents
        pretrained_ss_dec (str): name of the pretrained sparse structure decoder
        ss_dec_path (str): path to the sparse structure decoder, if given, will override the pretrained_ss_dec
        ss_dec_ckpt (str): name of the sparse structure decoder checkpoint
    """
    def __init__(self,
        roots: str,
        *,
        latent_model: str,
        min_aesthetic_score: float = 5.0,
        normalization: Optional[dict] = None,
        normalization_skl: Optional[dict] = None,
        pretrained_ss_dec: str = None,
        ss_dec_path: Optional[str] = '',
        ss_dec_ckpt: Optional[str] = 'final',
        **kwargs,
    ):
        self.latent_model = latent_model
        self.min_aesthetic_score = min_aesthetic_score
        # NOTE(review): despite the docstring's 'dict', these are treated as
        # paths to .npz stats files below — confirm intended type.
        self.normalization = normalization
        self.normalization_skl = normalization_skl
        self.value_range = (0, 1)

        super().__init__(
            roots,
            pretrained_ss_dec=pretrained_ss_dec,
            ss_dec_path=ss_dec_path,
            ss_dec_ckpt=ss_dec_ckpt,
            **kwargs,
        )

        if self.normalization is not None:
            data = np.load(self.normalization)
            self.mean = torch.tensor(data['feats_mean'])
            self.std = torch.tensor(data['feats_std'])
        if self.normalization_skl is not None:
            data = np.load(self.normalization_skl)
            self.mean_skl = torch.tensor(data['feats_skl_mean'])
            # Clip to avoid division blow-up on near-constant skeleton dims.
            self.std_skl = torch.tensor(data['feats_skl_std']).clip(min=1e-3)

    def filter_metadata(self, metadata):
        """Keep instances with latents, good aesthetics, and good rigs.

        Returns:
            (metadata, stats): filtered dataframe and per-filter counts.
        """
        stats = {}
        metadata = metadata[metadata[f'ss_latent_{self.latent_model}']]
        stats['With sparse structure latents'] = len(metadata)
        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)

        if 'is_bad_skeleton' in metadata.columns:
            metadata = metadata[~metadata['is_bad_skeleton']]
        if 'is_bad_skin' in metadata.columns:
            metadata = metadata[~metadata['is_bad_skin']]

        return metadata, stats

    def get_instance(self, root, instance):
        """Load (and optionally normalize) one instance's latent pair.

        Returns:
            dict with 'instance', 'x_0' (shape latent), 'x_0_skl' (skeleton
            latent), both float tensors.
        """
        latent = np.load(os.path.join(root, 'ss_latents', self.latent_model, f'{instance}.npz'))
        z = torch.tensor(latent['mean']).float()
        z_skl = torch.tensor(latent['mean_skl']).float()
        if self.normalization is not None:
            z = (z - self.mean) / self.std
        if self.normalization_skl is not None:
            z_skl = (z_skl - self.mean_skl) / self.std_skl

        pack = {
            'instance': instance,
            'x_0': z,
            'x_0_skl': z_skl,
        }
        return pack
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
class TextConditionedAniGenSparseStructureLatent(TextConditionedMixin, AniGenSparseStructureLatent):
    """
    Text-conditioned sparse structure dataset.

    Behavior comes entirely from TextConditionedMixin layered over
    AniGenSparseStructureLatent; no overrides are needed here.
    """
    pass
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
class ImageConditionedAniGenSparseStructureLatent(ImageConditionedMixin, AniGenSparseStructureLatent):
    """
    Image-conditioned sparse structure dataset.

    Behavior comes entirely from ImageConditionedMixin layered over
    AniGenSparseStructureLatent; no overrides are needed here.
    """
    pass
|
| 238 |
+
|
anigen/datasets/anigen_structured_latent.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from typing import *
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
import utils3d.torch
|
| 7 |
+
from .components import StandardDatasetBase, TextConditionedMixin, ImageConditionedMixin
|
| 8 |
+
from ..modules.sparse.basic import SparseTensor
|
| 9 |
+
from .. import models
|
| 10 |
+
from ..utils.render_utils import get_renderer
|
| 11 |
+
from ..utils.dist_utils import read_file_dist
|
| 12 |
+
from ..utils.data_utils import load_balanced_group_indices
|
| 13 |
+
import copy
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AniGenSLatVisMixin:
    """Mixin that lazily loads a structured-latent (SLat) decoder to turn
    latents back into renderable representations for visualization.

    The decoder is loaded on first use (onto CUDA) and deleted again after
    each `decode_latent` call to free GPU memory between visualizations.
    """
    def __init__(
        self,
        *args,
        pretrained_slat_dec: str = None,
        slat_dec_path: Optional[str] = None,
        slat_dec_ckpt: Optional[str] = None,
        load_cubvh: bool = False,
        **kwargs
    ):
        # slat_dec_path/slat_dec_ckpt (a training run directory + checkpoint
        # name) take precedence over pretrained_slat_dec in _loading_slat_dec.
        super().__init__(*args, **kwargs)
        self.slat_dec = None
        self.pretrained_slat_dec = pretrained_slat_dec
        self.slat_dec_path = slat_dec_path
        self.slat_dec_ckpt = slat_dec_ckpt
        self.load_cubvh = load_cubvh

    def _loading_slat_dec(self):
        """Instantiate the decoder if it is not already loaded (idempotent)."""
        if self.slat_dec is not None:
            return
        if self.slat_dec_path is not None:
            # Build the decoder from the run's config.json and load its weights.
            cfg = json.load(open(os.path.join(self.slat_dec_path, 'config.json'), 'r'))
            decoder = getattr(models, cfg['models']['decoder']['name'])(**cfg['models']['decoder']['args'])
            ckpt_path = os.path.join(self.slat_dec_path, 'ckpts', f'decoder_{self.slat_dec_ckpt}.pt')
            # decoder.load_state_dict(torch.load(read_file_dist(ckpt_path), map_location='cpu', weights_only=True))
            decoder.load_state_dict(torch.load(ckpt_path, map_location='cpu', weights_only=True))
        else:
            decoder = models.from_pretrained(self.pretrained_slat_dec)
        self.slat_dec = decoder.cuda().eval()

    def _delete_slat_dec(self):
        """Drop the decoder reference so its GPU memory can be reclaimed."""
        del self.slat_dec
        self.slat_dec = None

    @torch.no_grad()
    def decode_latent(self, z, z_skl, gt_joints=None, gt_parents=None, batch_size=4, gt_reps=None, gt_reps_skl=None):
        """Decode batched shape/skeleton latents into representations.

        Returns a tuple whose arity depends on the arguments:
        (reps, reps_skl[, skins_gt_ssskin][, skins_gt_sklskin]) — the extra
        entries are skin predictions computed against ground-truth reps when
        gt_reps / gt_reps_skl are provided.

        NOTE(review): self.normalization / self.mean / self.std are expected
        to be provided by the class this is mixed into — confirm against the
        concrete dataset.
        """
        self._loading_slat_dec()
        reps = []
        reps_skl = []
        if gt_reps is not None:
            skins_gt_ssskin = []
        if gt_reps_skl is not None:
            skins_gt_sklskin = []
        if self.normalization is not None:
            # Undo dataset-level whitening before decoding.
            z = z * self.std.to(z.device) + self.mean.to(z.device)
            z_skl = z_skl * self.std_skl.to(z.device) + self.mean_skl.to(z.device)
        # Decode in mini-batches to bound peak GPU memory.
        for i in range(0, z.shape[0], batch_size):
            gt_j, gt_p = None if gt_joints is None else gt_joints[i:i+batch_size], None if gt_parents is None else gt_parents[i:i+batch_size]
            z_, z_skl_ = z[i:i+batch_size], z_skl[i:i+batch_size]
            rep, rep_skl = self.slat_dec(z_, z_skl_, gt_joints=gt_j, gt_parents=gt_p)
            reps.append(rep)
            reps_skl.append(rep_skl)
            if gt_reps is not None:
                # Skin prediction using ground-truth shape reps + decoded skeleton reps.
                skins_gt_ssskin.append(self.slat_dec.skinweight_forward(gt_reps[i:i+batch_size], rep_skl, gt_joints=gt_j, gt_parents=gt_p, return_skin_pred_only=True))
            if gt_reps_skl is not None:
                # Skin prediction using decoded shape reps + ground-truth skeleton reps.
                skins_gt_sklskin.append(self.slat_dec.skinweight_forward(rep, gt_reps_skl[i:i+batch_size], gt_joints=gt_j, gt_parents=gt_p, return_skin_pred_only=True))
        # Each decoder call returns per-sample lists; flatten the list of lists.
        reps = sum(reps, [])
        reps_skl = sum(reps_skl, [])
        self._delete_slat_dec()
        to_return = (reps, reps_skl)
        if gt_reps is not None:
            skins_gt_ssskin = sum(skins_gt_ssskin, [])
            to_return += (skins_gt_ssskin,)
        if gt_reps_skl is not None:
            skins_gt_sklskin = sum(skins_gt_sklskin, [])
            to_return += (skins_gt_sklskin,)
        return to_return
|
| 84 |
+
|
| 85 |
+
class AniGenSLat(AniGenSLatVisMixin, StandardDatasetBase):
    """
    structured latent dataset

    Args:
        roots (str): path to the dataset
        latent_model (str): name of the latent model
        min_aesthetic_score (float): minimum aesthetic score
        max_num_voxels (int): maximum number of voxels
        normalization (dict): normalization stats
        pretrained_slat_dec (str): name of the pretrained slat decoder
        slat_dec_path (str): path to the slat decoder, if given, will override the pretrained_slat_dec
        slat_dec_ckpt (str): name of the slat decoder checkpoint
    """
    def __init__(self,
        roots: str,
        *,
        latent_model: str,
        use_joint_num_cond: bool = False,
        min_aesthetic_score: float = 5.0,
        max_num_voxels: int = 32768,
        normalization: Optional[dict] = None,
        pretrained_slat_dec: str = None,
        slat_dec_path: Optional[str] = None,
        slat_dec_ckpt: Optional[str] = None,
        local_rank: int = 0,
        **kwargs,
    ):
        # Attributes consumed by filter_metadata must be set BEFORE the
        # super().__init__ call, which reads metadata and calls filter_metadata.
        self.normalization = normalization
        self.latent_model = latent_model
        self.use_joint_num_cond = use_joint_num_cond
        self.min_aesthetic_score = min_aesthetic_score
        self.max_num_voxels = max_num_voxels
        self.value_range = (0, 1)
        self.local_rank = local_rank

        super().__init__(
            roots,
            pretrained_slat_dec=pretrained_slat_dec,
            slat_dec_path=slat_dec_path,
            slat_dec_ckpt=slat_dec_ckpt,
            **kwargs,
        )

        # Per-instance voxel counts, used for load-balanced batching.
        self.loads = [self.metadata.loc[sha256, 'num_voxels'] for _, sha256 in self.instances]

        if self.normalization is not None:
            # Shape (1, C) so they broadcast over per-voxel feature rows.
            self.mean = torch.tensor(self.normalization['mean']).reshape(1, -1)
            self.std = torch.tensor(self.normalization['std']).reshape(1, -1)
            self.mean_skl = torch.tensor(self.normalization['mean_skl']).reshape(1, -1)
            self.std_skl = torch.tensor(self.normalization['std_skl']).reshape(1, -1)

    def filter_metadata(self, metadata):
        """Keep rows with a latent for this model, adequate aesthetic score,
        and (when flagged) non-bad skeleton/skin. Returns (df, stats)."""
        stats = {}
        metadata = metadata[metadata[f'latent_{self.latent_model}']]
        stats['With latent'] = len(metadata)
        metadata = metadata[metadata['aesthetic_score'] >= self.min_aesthetic_score]
        stats[f'Aesthetic score >= {self.min_aesthetic_score}'] = len(metadata)
        # Voxel-count cap intentionally disabled (max_num_voxels is unused here).
        # metadata = metadata[metadata['num_voxels'] <= self.max_num_voxels]
        # stats[f'Num voxels <= {self.max_num_voxels}'] = len(metadata)

        if 'is_bad_skeleton' in metadata.columns:
            metadata = metadata[~metadata['is_bad_skeleton']]
        if 'is_bad_skin' in metadata.columns:
            metadata = metadata[~metadata['is_bad_skin']]

        return metadata, stats

    @torch.no_grad()
    def visualize_sample(self, data: dict):
        # NOTE(review): visualization is disabled — this early return makes
        # everything below unreachable dead code (kept for reference).
        return {}
        x_0 = data['x_0']
        x_0_skl = data['x_0_skl']
        reps, reps_skl = self.decode_latent(x_0.cuda(), x_0_skl.cuda(), data['joints'])

        # Build camera
        yaws = [0, np.pi / 2, np.pi, 3 * np.pi / 2]
        yaws_offset = np.random.uniform(-np.pi / 4, np.pi / 4)
        yaws = [y + yaws_offset for y in yaws]
        pitch = [np.random.uniform(-np.pi / 4, np.pi / 4) for _ in range(4)]

        exts = []
        ints = []
        # NOTE(review): the loop variable shadows the 'pitch' list; harmless
        # here because the list is not used again after zip() is evaluated.
        for yaw, pitch in zip(yaws, pitch):
            orig = torch.tensor([
                np.sin(yaw) * np.cos(pitch),
                np.cos(yaw) * np.cos(pitch),
                np.sin(pitch),
            ]).float().cuda() * 2
            fov = torch.deg2rad(torch.tensor(40)).cuda()
            extrinsics = utils3d.torch.extrinsics_look_at(orig, torch.tensor([0, 0, 0]).float().cuda(), torch.tensor([0, 0, 1]).float().cuda())
            intrinsics = utils3d.torch.intrinsics_from_fov_xy(fov, fov)
            exts.append(extrinsics)
            ints.append(intrinsics)

        renderer = get_renderer(reps[0])
        images = []
        for representation in reps:
            # 2x2 tiling of 512x512 renders into one 1024x1024 image.
            image = torch.zeros(3, 1024, 1024).cuda()
            tile = [2, 2]
            for j, (ext, intr) in enumerate(zip(exts, ints)):
                res = renderer.render(representation, ext, intr)
                image[:, 512 * (j // tile[1]):512 * (j // tile[1] + 1), 512 * (j % tile[1]):512 * (j % tile[1] + 1)] = res['color']
            images.append(image)
        images = torch.stack(images)

        return images

    def _get_skeleton(self, root, instance):
        """Load the voxelized skeleton (joints, parents, skin weights) for one instance."""
        skeleton_path = os.path.join(root, 'skeleton', instance, 'skeleton_voxelized.npz')
        skl_data = np.load(skeleton_path, allow_pickle=True)
        joints, parents, skin = skl_data['joints'], skl_data['parents'], skl_data['skin']
        # Root joints are stored with parent None; encode them as -1 for int tensors.
        parents[parents==None] = -1
        parents = np.array(parents, dtype=np.int32)
        ret = {
            'joints': torch.from_numpy(joints).float(),
            'parents': torch.from_numpy(parents).int(),
            'skin': torch.from_numpy(skin).float(),
        }
        if self.use_joint_num_cond:
            ret['joints_num'] = int(joints.shape[0])
        return ret

    def _get_geo(self, root, instance):
        """Load the instance mesh, and optionally a cuBVH over it (cached on disk)."""
        skeleton_path = os.path.join(root, 'skeleton', instance, 'skeleton_voxelized.npz')
        skl_data = np.load(skeleton_path, allow_pickle=True)
        verts, face = np.array(skl_data['vertices'], dtype=np.float32), skl_data['faces']
        mesh = {
            "vertices" : torch.from_numpy(verts),
            "faces" : torch.from_numpy(face),
        }
        geo = {"mesh": mesh}
        if self.load_cubvh:
            from cubvh import cuBVH
            torch.cuda.set_device(self.local_rank)
            cubvh_path = os.path.join(root, 'skeleton', instance, 'cubvh.pth')
            if os.path.exists(cubvh_path):
                # NOTE(review): torch.load with weights_only=False unpickles
                # arbitrary objects — safe only for trusted, self-generated caches.
                cubvh = torch.load(cubvh_path, weights_only=False)
            else:
                cubvh = cuBVH(mesh["vertices"], mesh["faces"])
                torch.save(cubvh, cubvh_path)
            geo["cubvh"] = cubvh
        return geo

    def get_instance(self, root, instance):
        """Load one training sample: sparse latents (shape + skeleton),
        skeleton annotations, and geometry."""
        data = np.load(os.path.join(root, 'latents', self.latent_model, f'{instance}.npz'))
        coords = torch.tensor(data['coords']).int()
        feats = torch.tensor(data['feats']).float()
        coords_skl = torch.tensor(data['coords_skl']).int()
        feats_skl = torch.tensor(data['feats_skl']).float()
        if self.normalization is not None:
            feats = (feats - self.mean) / self.std
            feats_skl = (feats_skl - self.mean_skl) / self.std_skl
        return {
            'coords': coords,
            'feats': feats,
            'coords_skl': coords_skl,
            'feats_skl': feats_skl,
            'instance': instance,
            **self._get_skeleton(root, instance),
            **self._get_geo(root, instance),
        }

    @staticmethod
    def collate_fn(batch, split_size=None):
        """Collate samples into SparseTensor packs.

        When split_size is given, samples are partitioned into load-balanced
        groups (by voxel count) and a list of packs is returned; otherwise a
        single pack for the whole batch.
        """
        if split_size is None:
            group_idx = [list(range(len(batch)))]
        else:
            group_idx = load_balanced_group_indices([b['coords'].shape[0] for b in batch], split_size)
        packs = []
        for group in group_idx:
            sub_batch = [batch[i] for i in group]
            pack = {}
            coords = []
            feats = []
            coords_skl = []
            feats_skl = []
            layout = []
            layout_skl = []
            start = 0
            start_skl = 0
            for i, b in enumerate(sub_batch):
                # Prepend the within-group batch index as the first coord column.
                coords.append(torch.cat([torch.full((b['coords'].shape[0], 1), i, dtype=torch.int32), b['coords']], dim=-1))
                feats.append(b['feats'])
                coords_skl.append(torch.cat([torch.full((b['coords_skl'].shape[0], 1), i, dtype=torch.int32), b['coords_skl']], dim=-1))
                feats_skl.append(b['feats_skl'])
                # Record each sample's row span so it can be sliced back out later.
                layout.append(slice(start, start + b['coords'].shape[0]))
                layout_skl.append(slice(start_skl, start_skl + b['coords_skl'].shape[0]))
                start += b['coords'].shape[0]
                start_skl += b['coords_skl'].shape[0]
            coords = torch.cat(coords)
            feats = torch.cat(feats)
            pack['x_0'] = SparseTensor(
                coords=coords,
                feats=feats,
            )
            # NOTE(review): writes the private _shape attribute of SparseTensor
            # directly — relies on that internal staying stable.
            pack['x_0']._shape = torch.Size([len(group), *sub_batch[0]['feats'].shape[1:]])
            pack['x_0'].register_spatial_cache('layout', layout)

            coords_skl = torch.cat(coords_skl)
            feats_skl = torch.cat(feats_skl)
            pack['x_0_skl'] = SparseTensor(
                coords=coords_skl,
                feats=feats_skl,
            )
            pack['x_0_skl']._shape = torch.Size([len(group), *sub_batch[0]['feats_skl'].shape[1:]])
            pack['x_0_skl'].register_spatial_cache('layout', layout_skl)

            # Variable-length skeleton data stays as per-sample lists.
            pack['joints'] = [b['joints'] for b in sub_batch]
            pack['parents'] = [b['parents'] for b in sub_batch]
            pack['skin'] = [b['skin'] for b in sub_batch]
            if 'joints_num' in sub_batch[0]:
                pack['joints_num'] = torch.tensor([b['joints_num'] for b in sub_batch], dtype=torch.long)

            # collate other data
            keys = [k for k in sub_batch[0].keys() if k not in ['coords', 'feats', 'coords_skl', 'feats_skl', 'joints', 'parents', 'skin', 'joints_num']]
            for k in keys:
                if isinstance(sub_batch[0][k], torch.Tensor):
                    pack[k] = torch.stack([b[k] for b in sub_batch])
                elif isinstance(sub_batch[0][k], list):
                    pack[k] = sum([b[k] for b in sub_batch], [])
                else:
                    pack[k] = [b[k] for b in sub_batch]

            packs.append(pack)

        if split_size is None:
            return packs[0]
        return packs
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
class TextConditionedSLat(TextConditionedMixin, AniGenSLat):
    """
    Text conditioned structured latent dataset.

    Pure composition class: caption handling from the mixin, latent
    loading/collation from AniGenSLat; no overrides needed.
    """
    pass
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
class AniGenImageConditionedSLat(ImageConditionedMixin, AniGenSLat):
    """
    Image conditioned structured latent dataset.

    Pure composition class: conditioning-image handling from the mixin,
    latent loading/collation from AniGenSLat; no overrides needed.
    """
    pass
|
anigen/datasets/components.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
from abc import abstractmethod
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from torch.utils.data import Dataset
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class StandardDatasetBase(Dataset):
    """Common scaffolding for metadata-driven datasets.

    Reads one ``metadata.csv`` per root (comma-separated in ``roots``),
    lets the subclass filter it, and builds the flat ``(root, sha256)``
    instance list. Subclasses implement ``filter_metadata`` and
    ``get_instance``.

    Args:
        roots (str): comma-separated dataset root directories.
        instances: optional whitelist of sha256 ids to keep.
    """

    def __init__(self,
        roots: str,
        instances: List[str] = None,
        **kwargs,
    ):
        super().__init__()
        self.roots = roots.split(',')
        self.instances = []
        self.metadata = pd.DataFrame()
        self._stats = {}

        for root in self.roots:
            source = os.path.basename(root)
            table = pd.read_csv(os.path.join(root, 'metadata.csv'))
            # Track filtering statistics per source for __str__ reporting.
            self._stats[source] = {'Total': len(table)}
            table, stats = self.filter_metadata(table)
            self._stats[source].update(stats)
            self.instances.extend((root, sha) for sha in table['sha256'].values)
            table.set_index('sha256', inplace=True)
            self.metadata = pd.concat([self.metadata, table])

        if instances is not None:
            self.test_mode = False
            self.instances = [pair for pair in self.instances if pair[1] in instances]

    @abstractmethod
    def filter_metadata(self, metadata: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, int]]:
        """Return (filtered metadata, {filter description: surviving row count})."""
        pass

    @abstractmethod
    def get_instance(self, root: str, instance: str) -> Dict[str, Any]:
        """Load and return one sample identified by (root, sha256)."""
        pass

    def __len__(self):
        return len(self.instances)

    def __getitem__(self, index) -> Dict[str, Any]:
        try:
            source_root, sha = self.instances[index]
            return self.get_instance(source_root, sha)
        except Exception as exc:
            # Best-effort loading: report the failure, then retry a random sample.
            print(exc)
            return self.__getitem__(np.random.randint(0, len(self)))

    def __str__(self):
        report = [
            self.__class__.__name__,
            f' - Total instances: {len(self)}',
            ' - Sources:',
        ]
        for source, source_stats in self._stats.items():
            report.append(f' - {source}:')
            report.extend(f' - {name}: {count}' for name, count in source_stats.items())
        return '\n'.join(report)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class TextConditionedMixin:
    """Attaches a randomly sampled text caption as ``cond`` to each sample."""

    def __init__(self, roots, **kwargs):
        super().__init__(roots, **kwargs)
        # Parse the JSON caption list of every surviving instance up front.
        self.captions = {
            sha: json.loads(self.metadata.loc[sha]['captions'])
            for _, sha in self.instances
        }

    def filter_metadata(self, metadata):
        """Extend the base filtering: keep only rows that carry captions."""
        metadata, stats = super().filter_metadata(metadata)
        metadata = metadata[metadata['captions'].notna()]
        stats['With captions'] = len(metadata)
        return metadata, stats

    def get_instance(self, root, instance):
        """Build the base pack, then add one uniformly chosen caption."""
        pack = super().get_instance(root, instance)
        pack['cond'] = np.random.choice(self.captions[instance])
        return pack
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class ImageConditionedMixin:
    """Attaches a rendered conditioning image as ``cond`` to each sample.

    A random view is picked from the instance's ``renders_cond`` folder,
    cropped around the alpha-channel bounding box with 20% margin, resized,
    and alpha-premultiplied to a (3, H, W) float tensor in [0, 1].
    """
    def __init__(self, roots, *, image_size=518, **kwargs):
        # image_size must be set before super().__init__ in case the base
        # constructor ends up calling get_instance.
        self.image_size = image_size
        super().__init__(roots, **kwargs)

    def filter_metadata(self, metadata):
        """Extend the base filtering: keep only rows whose condition images were rendered."""
        metadata, stats = super().filter_metadata(metadata)
        metadata = metadata[metadata[f'cond_rendered']]
        stats['Cond rendered'] = len(metadata)
        return metadata, stats

    def get_instance(self, root, instance):
        pack = super().get_instance(root, instance)

        # Pick one random view from the instance's render set.
        image_root = os.path.join(root, 'renders_cond', instance)
        with open(os.path.join(image_root, 'transforms.json')) as f:
            metadata = json.load(f)
        n_views = len(metadata['frames'])
        view = np.random.randint(n_views)
        metadata = metadata['frames'][view]

        image_path = os.path.join(image_root, metadata['file_path'])
        image = Image.open(image_path)

        # Square crop centered on the alpha-channel bounding box,
        # enlarged by aug_size_ratio so the object is not tight to the border.
        alpha = np.array(image.getchannel(3))
        bbox = np.array(alpha).nonzero()
        bbox = [bbox[1].min(), bbox[0].min(), bbox[1].max(), bbox[0].max()]
        center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
        hsize = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
        aug_size_ratio = 1.2
        aug_hsize = hsize * aug_size_ratio
        aug_center_offset = [0, 0]  # hook for jittering the crop center; currently disabled
        aug_center = [center[0] + aug_center_offset[0], center[1] + aug_center_offset[1]]
        aug_bbox = [int(aug_center[0] - aug_hsize), int(aug_center[1] - aug_hsize), int(aug_center[0] + aug_hsize), int(aug_center[1] + aug_hsize)]
        image = image.crop(aug_bbox)

        # Resize, then premultiply RGB by alpha (black background).
        image = image.resize((self.image_size, self.image_size), Image.Resampling.LANCZOS)
        alpha = image.getchannel(3)
        image = image.convert('RGB')
        image = torch.tensor(np.array(image)).permute(2, 0, 1).float() / 255.0
        alpha = torch.tensor(np.array(alpha)).float() / 255.0
        image = image * alpha.unsqueeze(0)
        pack['cond'] = image

        return pack
|
| 143 |
+
|
anigen/models/__init__.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
|
| 3 |
+
# Lazy attribute table (PEP 562): each public model class is imported from
# its defining submodule only on first access, keeping package import cheap.
__attributes = {
    'AniGenSparseStructureEncoder': 'anigen_sparse_structure_vae',
    'AniGenSparseStructureDecoder': 'anigen_sparse_structure_vae',
    'AniGenSparseStructureFlowModel': 'anigen_sparse_structure_flow',
    'AniGenSparseStructureFlowModelInpaint': 'anigen_sparse_structure_flow_inpaint',
    'AniGenElasticSLatEncoder': 'structured_latent_vae',
    'AniGenElasticSLatMeshDecoder': 'structured_latent_vae',
    'AniGenElasticSLatGaussianDecoder': 'structured_latent_vae',
    'AniGenSLatFlowModel': 'anigen_structured_latent_flow',
    'AniGenElasticSLatFlowModel': 'anigen_structured_latent_flow',
    'AniGenElasticSLatFlowModelOld': 'anigen_structured_latent_flow_old',
    'SkinAutoEncoder': 'structured_latent_vae',
}

__submodules = []

__all__ = list(__attributes.keys()) + __submodules

def __getattr__(name):
    """Module-level lazy loader: resolve, cache in globals(), and return."""
    if name in globals():
        return globals()[name]
    if name in __attributes:
        source = importlib.import_module(f".{__attributes[name]}", __name__)
        globals()[name] = getattr(source, name)
    elif name in __submodules:
        globals()[name] = importlib.import_module(f".{name}", __name__)
    else:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return globals()[name]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def from_pretrained(path: str, **kwargs):
    """
    Load a model from a pretrained checkpoint.

    Args:
        path: The path to the checkpoint. Can be either local path or a Hugging Face model name.
            NOTE: config file and model file should take the name f'{path}.json' and f'{path}.safetensors' respectively.
        **kwargs: Additional arguments for the model constructor.

    Returns:
        The instantiated model with weights loaded.
    """
    # Local imports keep heavy/optional deps out of package import time.
    import os
    import json
    from safetensors.torch import load_file
    is_local = os.path.exists(f"{path}.json") and os.path.exists(f"{path}.safetensors")

    if is_local:
        config_file = f"{path}.json"
        model_file = f"{path}.safetensors"
    else:
        print(f"{path}.json and {path}.safetensors not found, trying to download from Hugging Face Hub.")
        from huggingface_hub import hf_hub_download
        # Expected layout: '<org>/<repo>/<model_name...>' — the first two
        # segments form the repo id, the remainder is the file name prefix.
        path_parts = path.split('/')
        repo_id = f'{path_parts[0]}/{path_parts[1]}'
        model_name = '/'.join(path_parts[2:])
        config_file = hf_hub_download(repo_id, f"{model_name}.json")
        model_file = hf_hub_download(repo_id, f"{model_name}.safetensors")

    with open(config_file, 'r') as f:
        config = json.load(f)
    # Resolve the class lazily via the module-level __getattr__ table.
    model = __getattr__(config['name'])(**config['args'], **kwargs)
    model.load_state_dict(load_file(model_file))

    return model
|
| 67 |
+
|
anigen/models/anigen_sparse_structure_flow.py
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
from ..modules.utils import convert_module_to_f16, convert_module_to_f32
|
| 7 |
+
from ..modules.transformer import AbsolutePositionEmbedder, ModulatedTransformerCrossBlock
|
| 8 |
+
from ..modules.spatial import patchify, unpatchify
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TimestepEmbedder(nn.Module):
    """Maps scalar diffusion timesteps to learned vector embeddings.

    A fixed sinusoidal encoding of the timestep is passed through a small
    two-layer MLP to produce the final embedding.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        Args:
            t: a 1-D Tensor of N indices, one per batch element. May be fractional.
            dim: the dimension of the output.
            max_period: controls the minimum frequency of the embeddings.

        Returns:
            an (N, dim) Tensor of positional embeddings (cos half, then sin half,
            zero-padded by one column when dim is odd).
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half_dim = dim // 2
        inv_freq = torch.exp(
            torch.arange(start=0, end=half_dim, dtype=torch.float32) * (-np.log(max_period) / half_dim)
        ).to(device=t.device)
        phase = t[:, None].float() * inv_freq[None]
        out = torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
        if dim % 2:
            # Odd target dimension: pad with a zero column.
            out = torch.cat([out, torch.zeros_like(out[:, :1])], dim=-1)
        return out

    def forward(self, t):
        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class AniGenSparseStructureFlowModel(nn.Module):
    """
    Dual-branch flow transformer over sparse-structure latents.

    Runs two parallel DiT-style stacks -- one for the shape ("ss") latent and
    one for the skeleton ("skl") latent -- with optional per-block information
    exchange between the branches, either via zero-initialized linear adapters
    (dense token grids of equal size) or via cross-attention blocks inserted
    every ``cross_adapter_every`` layers (when either branch uses global
    tokens, so token counts may differ).
    """
    def __init__(
        self,
        resolution: int,
        in_channels: int,
        in_channels_skl: int,
        model_channels: int,
        model_channels_skl: int,
        cond_channels: int,
        out_channels: int,
        out_channels_skl: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4,
        patch_size: int = 2,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        share_mod: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        use_pretrain_branch: bool = True,
        freeze_pretrain_branch: bool = True,
        use_lora_ss: bool = False,
        lora_lr_rate_ss: float = 0.1,
        # NOTE(review): mutable default list is shared across instances;
        # it is only read here, but callers should not mutate it.
        modules_to_freeze: Optional[List[str]] = ["blocks", "input_layer", "out_layer", "pos_emb", "t_embedder"],
        adapter_ss_to_skl: bool = True,
        adapter_skl_to_ss: bool = True,
        predict_x0: bool = False,
        predict_x0_skl: bool = False,
        t_eps: float = 5e-2,
        t_scale: float = 1e3,
        z_is_global: bool = False,
        z_skl_is_global: bool = False,
        global_token_num: int = 1024,
        global_token_num_skl: int = 1024,
        cross_adapter_every: int = 4,
        skl_cross_from_ss: bool = False,
    ):
        super().__init__()
        self.resolution = resolution
        self.in_channels = in_channels
        self.in_channels_skl = in_channels_skl
        self.model_channels = model_channels
        self.model_channels_skl = model_channels_skl
        self.cond_channels = cond_channels
        self.out_channels = out_channels
        self.out_channels_skl = out_channels_skl
        self.num_blocks = num_blocks
        # Derive head count from per-head channels when not given explicitly.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.patch_size = patch_size
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        self.qk_rms_norm = qk_rms_norm
        self.qk_rms_norm_cross = qk_rms_norm_cross
        self.dtype = torch.float16 if use_fp16 else torch.float32
        self.use_pretrain_branch = use_pretrain_branch
        # LoRA finetuning implies the pretrained branch weights are frozen.
        self.freeze_pretrain_branch = freeze_pretrain_branch or use_lora_ss
        self.use_lora_ss = use_lora_ss
        self.modules_to_freeze = modules_to_freeze
        self.adapter_ss_to_skl = adapter_ss_to_skl
        self.adapter_skl_to_ss = adapter_skl_to_ss
        self.predict_x0 = predict_x0
        self.predict_x0_skl = predict_x0_skl
        self.t_eps = t_eps
        self.t_scale = t_scale
        self.z_is_global = z_is_global
        self.z_skl_is_global = z_skl_is_global
        self.global_token_num = global_token_num
        self.global_token_num_skl = global_token_num_skl
        self.cross_adapter_every = int(cross_adapter_every)
        self.skl_cross_from_ss = skl_cross_from_ss

        # Separate timestep embedders per branch (branch widths may differ).
        self.t_embedder = TimestepEmbedder(model_channels)
        self.t_embedder_skl = TimestepEmbedder(model_channels_skl)
        if share_mod:
            # Shared adaLN modulation computed once from the timestep embedding
            # and reused by every block of the respective branch.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels, 6 * model_channels, bias=True)
            )
            self.adaLN_modulation_skl = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels_skl, 6 * model_channels_skl, bias=True)
            )

        if pe_mode == "ape":
            # Absolute positional embeddings, precomputed and registered as
            # buffers. Global branches index tokens 1-D; dense branches use
            # 3-D grid coordinates at the patchified resolution.
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [resolution // patch_size] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            if self.z_is_global:
                pos_embedder = AbsolutePositionEmbedder(model_channels, 1)
                pos_emb = pos_embedder(torch.arange(self.global_token_num, device=self.device)[:, None])
            else:
                pos_embedder = AbsolutePositionEmbedder(model_channels, 3)
                pos_emb = pos_embedder(coords)
            self.register_buffer("pos_emb", pos_emb)
            if self.z_skl_is_global:
                pos_embedder_skl = AbsolutePositionEmbedder(model_channels_skl, 1)
                pos_emb_skl = pos_embedder_skl(torch.arange(self.global_token_num_skl, device=self.device)[:, None])
            else:
                pos_embedder_skl = AbsolutePositionEmbedder(model_channels_skl, 3)
                pos_emb_skl = pos_embedder_skl(coords)
            self.register_buffer("pos_emb_skl", pos_emb_skl)

        # Patch tokens are flattened patches of size patch_size**3.
        self.input_layer = nn.Linear(in_channels * patch_size**3, model_channels)
        self.input_layer_skl = nn.Linear(in_channels_skl * patch_size**3, model_channels_skl)

        # Depth thresholds controlling per-block LoRA configuration:
        # cross-attention LoRA rank grows with depth; self-attention LoRA is
        # only enabled in the deepest third of the stack.
        shallow = max(1, num_blocks // 3)
        middle = max(1, num_blocks // 3 * 2)
        self.blocks = nn.ModuleList([
            ModulatedTransformerCrossBlock(
                model_channels,
                cond_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode='full',
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                share_mod=share_mod,
                qk_rms_norm=self.qk_rms_norm,
                qk_rms_norm_cross=self.qk_rms_norm_cross,
                use_lora_self=self.use_lora_ss and idx >= middle,
                lora_rank_self=8,
                use_lora_cross=self.use_lora_ss,
                lora_rank_cross=8+(idx // shallow)*8,
                lora_lr_rate=lora_lr_rate_ss,
            )
            for idx in range(num_blocks)
        ])
        # Skeleton branch: cross-attends either to the external condition or,
        # when skl_cross_from_ss is set, to the ss branch's hidden states.
        self.blocks_skl = nn.ModuleList([
            ModulatedTransformerCrossBlock(
                model_channels_skl,
                cond_channels if not self.skl_cross_from_ss else model_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode='full',
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                share_mod=share_mod,
                qk_rms_norm=self.qk_rms_norm,
                qk_rms_norm_cross=self.qk_rms_norm_cross,
                use_context_norm=self.skl_cross_from_ss,
            )
            for _ in range(num_blocks)
        ])

        # When using global tokens, ss and skl token counts may differ, so we use cross-attention
        # for information exchange at a configurable frequency.
        self.use_cross_adapter = (self.z_is_global or self.z_skl_is_global) and (
            self.adapter_ss_to_skl or self.adapter_skl_to_ss
        )

        # Dense case: per-block linear adapters (token counts match, so a
        # simple residual projection suffices).
        if self.adapter_ss_to_skl and not self.use_cross_adapter:
            self.adapter_ss_to_skl_layers = nn.ModuleList([
                nn.Linear(model_channels, model_channels_skl) for _ in range(num_blocks)
            ])
        if self.adapter_skl_to_ss and not self.use_cross_adapter:
            self.adapter_skl_to_ss_layers = nn.ModuleList([
                nn.Linear(model_channels_skl, model_channels) for _ in range(num_blocks)
            ])

        # Insert a cross-adapter after every `cross_adapter_every`-th block;
        # guarantee at least one exchange point when the adapter is enabled.
        self.cross_adapter_every = max(1, self.cross_adapter_every)
        self.cross_block_indices: List[int] = [
            idx for idx in range(num_blocks) if (idx + 1) % self.cross_adapter_every == 0
        ]
        if self.use_cross_adapter and len(self.cross_block_indices) == 0 and num_blocks > 0:
            self.cross_block_indices = [num_blocks - 1]
        if self.use_cross_adapter and len(self.cross_block_indices) > 0:
            if self.adapter_ss_to_skl:
                self.cross_blocks_ss_to_skl = nn.ModuleList([
                    ModulatedTransformerCrossBlock(
                        model_channels_skl,
                        model_channels,
                        num_heads=self.num_heads,
                        mlp_ratio=self.mlp_ratio,
                        attn_mode='full',
                        use_checkpoint=self.use_checkpoint,
                        use_rope=(pe_mode == "rope"),
                        share_mod=share_mod,
                        qk_rms_norm=self.qk_rms_norm,
                        qk_rms_norm_cross=self.qk_rms_norm_cross,
                    )
                    for _ in self.cross_block_indices
                ])
                # Output projections (zero-initialized below) gating how much
                # of the cross-attended update is mixed back in.
                self.cross_blocks_ss_to_skl_out = nn.ModuleList([
                    nn.Linear(model_channels_skl, model_channels_skl, bias=True)
                    for _ in self.cross_block_indices
                ])
            if self.adapter_skl_to_ss:
                self.cross_blocks_skl_to_ss = nn.ModuleList([
                    ModulatedTransformerCrossBlock(
                        model_channels,
                        model_channels_skl,
                        num_heads=self.num_heads,
                        mlp_ratio=self.mlp_ratio,
                        attn_mode='full',
                        use_checkpoint=self.use_checkpoint,
                        use_rope=(pe_mode == "rope"),
                        share_mod=share_mod,
                        qk_rms_norm=self.qk_rms_norm,
                        qk_rms_norm_cross=self.qk_rms_norm_cross,
                    )
                    for _ in self.cross_block_indices
                ])
                self.cross_blocks_skl_to_ss_out = nn.ModuleList([
                    nn.Linear(model_channels, model_channels, bias=True)
                    for _ in self.cross_block_indices
                ])

        self.out_layer = nn.Linear(model_channels, out_channels * patch_size**3)
        self.out_layer_skl = nn.Linear(model_channels_skl, out_channels_skl * patch_size**3)

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

        # Freeze the pretrained (ss) branch, keeping any LoRA parameters
        # (names containing 'lora') trainable.
        if self.use_pretrain_branch and self.freeze_pretrain_branch:
            for module in modules_to_freeze:
                if hasattr(self, module):
                    mod = getattr(self, module)
                    if isinstance(mod, nn.ModuleList):
                        for m in mod:
                            for name, param in m.named_parameters():
                                if 'lora' not in name:
                                    param.requires_grad = False
                    elif isinstance(mod, nn.Module):
                        for name, param in mod.named_parameters():
                            if 'lora' not in name:
                                param.requires_grad = False
                    elif isinstance(mod, torch.Tensor):
                        if mod.requires_grad:
                            mod.requires_grad = False

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks.apply(convert_module_to_f16)
        self.blocks_skl.apply(convert_module_to_f16)
        if hasattr(self, "adapter_ss_to_skl_layers"):
            self.adapter_ss_to_skl_layers.apply(convert_module_to_f16)
        if hasattr(self, "adapter_skl_to_ss_layers"):
            self.adapter_skl_to_ss_layers.apply(convert_module_to_f16)
        if getattr(self, "use_cross_adapter", False):
            if hasattr(self, "cross_blocks_ss_to_skl"):
                self.cross_blocks_ss_to_skl.apply(convert_module_to_f16)
                self.cross_blocks_ss_to_skl_out.apply(convert_module_to_f16)
            if hasattr(self, "cross_blocks_skl_to_ss"):
                self.cross_blocks_skl_to_ss.apply(convert_module_to_f16)
                self.cross_blocks_skl_to_ss_out.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks.apply(convert_module_to_f32)
        self.blocks_skl.apply(convert_module_to_f32)
        if hasattr(self, "adapter_ss_to_skl_layers"):
            self.adapter_ss_to_skl_layers.apply(convert_module_to_f32)
        if hasattr(self, "adapter_skl_to_ss_layers"):
            self.adapter_skl_to_ss_layers.apply(convert_module_to_f32)
        if getattr(self, "use_cross_adapter", False):
            if hasattr(self, "cross_blocks_ss_to_skl"):
                self.cross_blocks_ss_to_skl.apply(convert_module_to_f32)
                self.cross_blocks_ss_to_skl_out.apply(convert_module_to_f32)
            if hasattr(self, "cross_blocks_skl_to_ss"):
                self.cross_blocks_skl_to_ss.apply(convert_module_to_f32)
                self.cross_blocks_skl_to_ss_out.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """
        DiT-style initialization: Xavier for linears, zero for modulation,
        output, and adapter layers so finetuning starts from identity-like
        behavior.
        """
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_embedder_skl.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder_skl.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        if self.share_mod:
            nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
            nn.init.constant_(self.adaLN_modulation_skl[-1].weight, 0)
            nn.init.constant_(self.adaLN_modulation_skl[-1].bias, 0)
        else:
            for block in self.blocks:
                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
            for block in self.blocks_skl:
                nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
                nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)
        nn.init.constant_(self.out_layer_skl.weight, 0)
        nn.init.constant_(self.out_layer_skl.bias, 0)

        # Zero-out adapter layers if exist
        if hasattr(self, "adapter_ss_to_skl_layers"):
            for layer in self.adapter_ss_to_skl_layers:
                nn.init.constant_(layer.weight, 0)
                nn.init.constant_(layer.bias, 0)
        if hasattr(self, "adapter_skl_to_ss_layers"):
            for layer in self.adapter_skl_to_ss_layers:
                nn.init.constant_(layer.weight, 0)
                nn.init.constant_(layer.bias, 0)

        # Zero-out cross adapter output projections (so we can safely finetune from pretrained ckpt)
        if getattr(self, "use_cross_adapter", False):
            if hasattr(self, "cross_blocks_ss_to_skl_out"):
                for layer in self.cross_blocks_ss_to_skl_out:
                    nn.init.constant_(layer.weight, 0)
                    nn.init.constant_(layer.bias, 0)
            if hasattr(self, "cross_blocks_skl_to_ss_out"):
                for layer in self.cross_blocks_skl_to_ss_out:
                    nn.init.constant_(layer.weight, 0)
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x: torch.Tensor, x_skl: torch.Tensor, t: torch.Tensor, cond: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Run both branches for one denoising step.

        Args:
            x: ss latent; dense (B, in_channels, R, R, R) grid unless
                z_is_global, in which case it is already a token sequence.
            x_skl: skl latent, analogous to ``x``.
            t: (B,) timestep values shared by both branches.
            cond: conditioning tokens cross-attended by the blocks.

        Returns:
            Tuple (h, h_skl) of per-branch predictions; when predict_x0
            (resp. predict_x0_skl) is set the x0 prediction is converted to
            a velocity-style output via (xt - x0) / t.
        """
        if not self.z_is_global:
            assert [*x.shape] == [x.shape[0], self.in_channels, *[self.resolution] * 3], \
                f"Input shape mismatch, got {x.shape}, expected {[x.shape[0], self.in_channels, *[self.resolution] * 3]}"
        if not self.z_skl_is_global:
            assert [*x_skl.shape] == [x_skl.shape[0], self.in_channels_skl, *[self.resolution] * 3], \
                f"Input shape mismatch, got {x_skl.shape}, expected {[x_skl.shape[0], self.in_channels_skl, *[self.resolution] * 3]}"

        # Keep the noisy inputs around for x0 -> velocity conversion below.
        if self.predict_x0:
            xt = x.clone()
        if self.predict_x0_skl:
            xt_skl = x_skl.clone()

        # Tokenize: dense grids are patchified to (B, tokens, feat); global
        # latents are already token sequences.
        if not self.z_is_global:
            h = patchify(x, self.patch_size)
            h = h.view(*h.shape[:2], -1).permute(0, 2, 1).contiguous()
        else:
            h = x
        if not self.z_skl_is_global:
            h_skl = patchify(x_skl, self.patch_size)
            h_skl = h_skl.view(*h_skl.shape[:2], -1).permute(0, 2, 1).contiguous()
        else:
            h_skl = x_skl

        h = self.input_layer(h)
        h = h + self.pos_emb[None]
        h_skl = self.input_layer_skl(h_skl)
        h_skl = h_skl + self.pos_emb_skl[None]

        t_emb = self.t_embedder(t)
        t_emb_skl = self.t_embedder_skl(t)
        if self.share_mod:
            # Precompute the shared modulation once for all blocks.
            t_emb = self.adaLN_modulation(t_emb)
            t_emb_skl = self.adaLN_modulation_skl(t_emb_skl)
        t_emb = t_emb.type(self.dtype)
        t_emb_skl = t_emb_skl.type(self.dtype)

        h = h.type(self.dtype)
        h_skl = h_skl.type(self.dtype)
        cond = cond.type(self.dtype)

        # Map block index -> cross-adapter index for the exchange layers.
        cross_pos_to_idx = None
        if self.use_cross_adapter and len(self.cross_block_indices) > 0:
            cross_pos_to_idx = {bidx: cidx for cidx, bidx in enumerate(self.cross_block_indices)}

        for idx, block, block_skl in zip(range(len(self.blocks)), self.blocks, self.blocks_skl):
            f = block(h, t_emb, cond)
            f_skl = block_skl(h_skl, t_emb_skl, h if self.skl_cross_from_ss else cond)

            if self.use_cross_adapter and cross_pos_to_idx is not None and idx in cross_pos_to_idx:
                cidx = cross_pos_to_idx[idx]
                # Residual cross-attention exchange; the `_out` projections are
                # zero-initialized so this is a no-op at the start of training.
                if self.adapter_ss_to_skl:
                    out_skl = self.cross_blocks_ss_to_skl[cidx](f_skl, t_emb_skl, f)
                    h_skl = f_skl + self.cross_blocks_ss_to_skl_out[cidx](out_skl - f_skl)
                else:
                    h_skl = f_skl

                if self.adapter_skl_to_ss:
                    out = self.cross_blocks_skl_to_ss[cidx](f, t_emb, f_skl)
                    h = f + self.cross_blocks_skl_to_ss_out[cidx](out - f)
                else:
                    h = f
            else:
                # Non-global (or no cross block at this idx): keep previous behavior.
                if self.adapter_ss_to_skl and (not self.use_cross_adapter):
                    h_skl = f_skl + self.adapter_ss_to_skl_layers[idx](f)
                else:
                    h_skl = f_skl

                if self.adapter_skl_to_ss and (not self.use_cross_adapter):
                    h = f + self.adapter_skl_to_ss_layers[idx](f_skl)
                else:
                    h = f
        h = h.type(x.dtype)
        h = F.layer_norm(h, h.shape[-1:])
        h = self.out_layer(h)
        h_skl = h_skl.type(x_skl.dtype)
        h_skl = F.layer_norm(h_skl, h_skl.shape[-1:])
        h_skl = self.out_layer_skl(h_skl)

        # Un-tokenize dense branches back to (B, C, R, R, R) grids.
        if not self.z_is_global:
            h = h.permute(0, 2, 1).view(h.shape[0], h.shape[2], *[self.resolution // self.patch_size] * 3)
            h = unpatchify(h, self.patch_size).contiguous()

        if not self.z_skl_is_global:
            h_skl = h_skl.permute(0, 2, 1).view(h_skl.shape[0], h_skl.shape[2], *[self.resolution // self.patch_size] * 3)
            h_skl = unpatchify(h_skl, self.patch_size).contiguous()

        # Convert x0 predictions to velocity: v = (xt - x0) / max(t, eps),
        # with t rescaled from [0, t_scale] to [0, 1].
        if self.predict_x0:
            t_normalized = t / self.t_scale
            factor = (1 / t_normalized.clamp_min(self.t_eps)).reshape([t.shape[0], *([1] * (x.dim() - 1))])
            h = (xt - h) * factor
        if self.predict_x0_skl:
            t_normalized = t / self.t_scale
            factor = (1 / t_normalized.clamp_min(self.t_eps)).reshape([t.shape[0], *([1] * (x_skl.dim() - 1))])
            h_skl = (xt_skl - h_skl) * factor

        return h, h_skl
|
anigen/models/anigen_sparse_structure_vae.py
ADDED
|
@@ -0,0 +1,729 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from ..modules.norm import GroupNorm32, ChannelLayerNorm32
|
| 6 |
+
from ..modules.spatial import pixel_shuffle_3d
|
| 7 |
+
from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
|
| 8 |
+
from ..modules.transformer import FeedForwardNet, TransformerBlock, TransformerCrossBlock, AbsolutePositionEmbedder
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def norm_layer(norm_type: str, *args, **kwargs) -> nn.Module:
    """
    Instantiate a normalization layer of the requested kind.

    Args:
        norm_type: "group" for a 32-group GroupNorm32, or "layer" for a
            channel-wise ChannelLayerNorm32; any other value raises
            ValueError. Remaining arguments are forwarded to the layer.
    """
    if norm_type == "layer":
        return ChannelLayerNorm32(*args, **kwargs)
    if norm_type == "group":
        return GroupNorm32(32, *args, **kwargs)
    raise ValueError(f"Invalid norm type {norm_type}")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ResBlock3d(nn.Module):
    """
    Pre-activation 3D residual block: two norm -> SiLU -> conv stages plus a
    skip connection. The second convolution is zero-initialized so the block
    behaves like the skip path at the start of training.
    """
    def __init__(
        self,
        channels: int,
        out_channels: Optional[int] = None,
        norm_type: Literal["group", "layer"] = "layer",
    ):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels

        self.norm1 = norm_layer(norm_type, channels)
        self.norm2 = norm_layer(norm_type, self.out_channels)
        self.conv1 = nn.Conv3d(channels, self.out_channels, 3, padding=1)
        self.conv2 = zero_module(nn.Conv3d(self.out_channels, self.out_channels, 3, padding=1))
        if channels != self.out_channels:
            # Project the residual with a 1x1x1 conv when channel counts differ.
            self.skip_connection = nn.Conv3d(channels, self.out_channels, 1)
        else:
            self.skip_connection = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the residual block to a (B, C, D, H, W) tensor."""
        out = self.conv1(F.silu(self.norm1(x)))
        out = self.conv2(F.silu(self.norm2(out)))
        return out + self.skip_connection(x)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class DownsampleBlock3d(nn.Module):
    """
    Halve the spatial resolution of a 3D feature map.

    mode="conv" uses a learned stride-2 convolution (and may change the
    channel count); mode="avgpool" uses 2x2x2 average pooling and requires
    in_channels == out_channels.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mode: Literal["conv", "avgpool"] = "conv",
    ):
        assert mode in ["conv", "avgpool"], f"Invalid mode {mode}"

        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        if mode == "conv":
            self.conv = nn.Conv3d(in_channels, out_channels, 2, stride=2)
        elif mode == "avgpool":
            assert in_channels == out_channels, "Pooling mode requires in_channels to be equal to out_channels"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Downsample ``x`` by a factor of 2 along each spatial axis."""
        # The presence of `self.conv` distinguishes the two modes.
        conv = getattr(self, "conv", None)
        if conv is None:
            return F.avg_pool3d(x, 2)
        return conv(x)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class UpsampleBlock3d(nn.Module):
    """
    Double the spatial resolution of a 3D feature map.

    mode="conv" expands channels 8x with a 3x3x3 convolution and rearranges
    them spatially via a 3D pixel shuffle; mode="nearest" uses
    nearest-neighbor interpolation and requires in_channels == out_channels.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        mode: Literal["conv", "nearest"] = "conv",
    ):
        assert mode in ["conv", "nearest"], f"Invalid mode {mode}"

        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        if mode == "conv":
            self.conv = nn.Conv3d(in_channels, out_channels*8, 3, padding=1)
        elif mode == "nearest":
            assert in_channels == out_channels, "Nearest mode requires in_channels to be equal to out_channels"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Upsample ``x`` by a factor of 2 along each spatial axis."""
        # The presence of `self.conv` distinguishes the two modes.
        if not hasattr(self, "conv"):
            return F.interpolate(x, scale_factor=2, mode="nearest")
        return pixel_shuffle_3d(self.conv(x), 2)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class AniGenSparseStructureEncoder(nn.Module):
    r"""
    Encoder for Sparse Structure (\mathcal{E}_S in the paper Sec. 3.3).

    Runs two parallel 3D conv pyramids: a shape branch (optionally taken from
    a pretrained model and frozen) and a skeleton branch (``*_skl``). Each
    branch ends either in a dense Conv3d head or, when the corresponding
    ``encode_global*`` flag is set, in an attention head that pools the grid
    features into a fixed number of global latent tokens.

    Args:
        in_channels (int): Channels of the shape input.
        in_channels_skl (int): Channels of the skeleton input.
        latent_channels (int): Channels of the shape latent representation.
        latent_channels_skl (int): Channels of the skeleton latent representation.
        num_res_blocks (int): Number of residual blocks at each resolution.
        channels (List[int]): Channels of the encoder blocks.
        num_res_blocks_middle (int): Number of residual blocks in the middle.
        norm_type (Literal["group", "layer"]): Type of normalization layer.
        use_fp16 (bool): Whether to use FP16.
        encode_global (bool): Pool the shape branch into global tokens.
        global_token_num (int): Number of global shape tokens.
        encode_global_skl (bool): Pool the skeleton branch into global tokens.
        global_token_num_skl (int): Number of global skeleton tokens.
        use_pretrain_branch (bool): The shape branch comes from a pretrained model.
        freeze_pretrain_branch (bool): Freeze the pretrained shape branch.
        modules_to_freeze (Optional[List[str]]): Attribute names to freeze when
            the pretrained branch is frozen; defaults to
            ``["input_layer", "blocks", "middle_block", "out_layer"]``.
        latent_denoising (bool): Deterministic shape latent (no VAE posterior).
        latent_denoising_skl (bool): Deterministic skeleton latent.
        normalize_z (bool): L2-normalize the shape latent (denoising mode only).
        normalize_z_skl (bool): L2-normalize the skeleton latent (denoising mode only).
        normalize_scale (float): Scale applied after L2 normalization.
    """
    def __init__(
        self,
        in_channels: int,
        in_channels_skl: int,
        latent_channels: int,
        latent_channels_skl: int,
        num_res_blocks: int,
        channels: List[int],
        num_res_blocks_middle: int = 2,
        norm_type: Literal["group", "layer"] = "layer",
        use_fp16: bool = False,
        encode_global: bool = False,
        global_token_num: int = 1024,
        encode_global_skl: bool = True,
        global_token_num_skl: int = 1024,
        use_pretrain_branch: bool = True,
        freeze_pretrain_branch: bool = True,
        modules_to_freeze: Optional[List[str]] = None,
        latent_denoising: bool = False,
        latent_denoising_skl: bool = True,
        normalize_z: bool = False,
        normalize_z_skl: bool = True,
        normalize_scale: float = 1.0
    ):
        super().__init__()
        # Fix: the original default was a shared mutable list; use None as the
        # sentinel and materialize the documented default here.
        if modules_to_freeze is None:
            modules_to_freeze = ["input_layer", "blocks", "middle_block", "out_layer"]
        self.in_channels = in_channels
        self.in_channels_skl = in_channels_skl
        self.latent_channels = latent_channels
        self.latent_channels_skl = latent_channels_skl
        self.num_res_blocks = num_res_blocks
        self.channels = channels
        self.num_res_blocks_middle = num_res_blocks_middle
        self.norm_type = norm_type
        self.use_fp16 = use_fp16
        self.dtype = torch.float16 if use_fp16 else torch.float32
        self.encode_global = encode_global
        self.global_token_num = global_token_num
        self.encode_global_skl = encode_global_skl
        self.global_token_num_skl = global_token_num_skl
        self.use_pretrain_branch = use_pretrain_branch
        self.freeze_pretrain_branch = freeze_pretrain_branch
        self.latent_denoising = latent_denoising
        self.latent_denoising_skl = latent_denoising_skl
        # Normalization only applies in the deterministic (denoising) regime.
        self.normalize_latent = normalize_z and latent_denoising
        self.normalize_latent_skl = normalize_z_skl and latent_denoising_skl
        self.normalize_scale = normalize_scale

        self.input_layer = nn.Conv3d(self.in_channels, channels[0], 3, padding=1)
        self.input_layer_skl = nn.Conv3d(self.in_channels_skl, channels[0], 3, padding=1)

        # Twin downsampling pyramids: shape and skeleton branches mirror each other.
        self.blocks = nn.ModuleList([])
        self.blocks_skl = nn.ModuleList([])
        for i, ch in enumerate(channels):
            self.blocks.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            self.blocks_skl.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            if i < len(channels) - 1:
                self.blocks.append(
                    DownsampleBlock3d(ch, channels[i+1])
                )
                self.blocks_skl.append(
                    DownsampleBlock3d(ch, channels[i+1])
                )

        self.middle_block = nn.Sequential(*[
            ResBlock3d(channels[-1], channels[-1])
            for _ in range(num_res_blocks_middle)
        ])
        # Fix: original had `channels[-1] if _ == 0 else channels[-1]` — both
        # branches were identical, so the conditional was dead code.
        self.middle_block_skl = nn.Sequential(*[
            ResBlock3d(channels[-1], channels[-1])
            for _ in range(num_res_blocks_middle)
        ])

        if self.encode_global:
            # Learnable query tokens + 1D positional embedding for the token axis.
            self.init_tokens_ss = nn.Parameter(torch.zeros(1, global_token_num, channels[-1]))
            pos_embedder = AbsolutePositionEmbedder(channels[-1], 1)
            coords = torch.arange(global_token_num, device=self.device).reshape(-1, 1)
            tokens_pos_emb = pos_embedder(coords)
            self.register_buffer('tokens_pos_emb_ss', tokens_pos_emb)
            # 3D positional embedding for the (flattened) bottleneck grid.
            upsample_factor = 2 ** (len(channels) - 1)
            # NOTE(review): assumes a 64^3 input resolution — confirm upstream.
            self.base_size_ss = 64 // upsample_factor
            pos_embedder = AbsolutePositionEmbedder(channels[-1], 3)
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [self.base_size_ss] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            grid_pos_emb = pos_embedder(coords)
            self.register_buffer("grid_pos_emb_ss", grid_pos_emb)
            # Projects [token | token-PE] back to the model width.
            self.token_proj_ss = nn.Linear(channels[-1]*2, channels[-1])

            # Head: one cross-attn (tokens attend to grid), 4 self-attn blocks,
            # then an FFN projecting to the latent width (x2 for mean/logvar
            # unless the latent is deterministic).
            self.out_layer = nn.ModuleList(
                [TransformerCrossBlock(
                    channels=channels[-1],
                    ctx_channels=channels[-1]*2,
                    out_channels=channels[-1],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                    x_is_query=False)] +
                [TransformerBlock(
                    channels=channels[-1],
                    out_channels=channels[-1],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                ) for _ in range(4)] +
                [FeedForwardNet(
                    channels=channels[-1],
                    out_channels=latent_channels*2 if not self.latent_denoising else latent_channels)]
            )
        else:
            self.out_layer = nn.Sequential(
                norm_layer(norm_type, channels[-1]),
                nn.SiLU(),
                nn.Conv3d(channels[-1], latent_channels*2 if not self.latent_denoising else latent_channels, 3, padding=1)
            )

        if self.encode_global_skl:
            # Same construction as above, for the skeleton branch.
            self.init_tokens = nn.Parameter(torch.zeros(1, global_token_num_skl, channels[-1]))
            pos_embedder = AbsolutePositionEmbedder(channels[-1], 1)
            coords = torch.arange(global_token_num_skl, device=self.device).reshape(-1, 1)
            tokens_pos_emb = pos_embedder(coords)
            self.register_buffer('tokens_pos_emb', tokens_pos_emb)
            upsample_factor = 2 ** (len(channels) - 1)
            self.base_size = 64 // upsample_factor
            pos_embedder = AbsolutePositionEmbedder(channels[-1], 3)
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [self.base_size] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            grid_pos_emb = pos_embedder(coords)
            self.register_buffer("grid_pos_emb", grid_pos_emb)
            self.token_proj = nn.Linear(channels[-1]*2, channels[-1])

            self.out_layer_skl = nn.ModuleList(
                [TransformerCrossBlock(
                    channels=channels[-1],
                    ctx_channels=channels[-1]*2,
                    out_channels=channels[-1],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                    x_is_query=False)] +
                [TransformerBlock(
                    channels=channels[-1],
                    out_channels=channels[-1],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                ) for _ in range(4)] +
                [FeedForwardNet(
                    channels=channels[-1],
                    out_channels=latent_channels_skl*2 if not self.latent_denoising_skl else latent_channels_skl)]
            )
        else:
            self.out_layer_skl = nn.Sequential(
                norm_layer(norm_type, channels[-1]),
                nn.SiLU(),
                nn.Conv3d(channels[-1], latent_channels_skl*2 if not self.latent_denoising_skl else latent_channels_skl, 3, padding=1)
            )

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

        if self.use_pretrain_branch and self.freeze_pretrain_branch:
            # Freeze the pretrained (shape-branch) modules listed by name.
            for module in modules_to_freeze:
                if hasattr(self, module):
                    mod = getattr(self, module)
                    if isinstance(mod, nn.ModuleList):
                        for m in mod:
                            for param in m.parameters():
                                param.requires_grad = False
                    else:
                        for param in mod.parameters():
                            param.requires_grad = False

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model (taken from its first parameter).
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.use_fp16 = True
        self.dtype = torch.float16
        self.blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)
        self.blocks_skl.apply(convert_module_to_f16)
        self.middle_block_skl.apply(convert_module_to_f16)
        if self.encode_global_skl:
            self.token_proj.apply(convert_module_to_f16)
            self.out_layer_skl.apply(convert_module_to_f16)
        if self.encode_global:
            self.token_proj_ss.apply(convert_module_to_f16)
            self.out_layer.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.use_fp16 = False
        self.dtype = torch.float32
        self.blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)
        self.blocks_skl.apply(convert_module_to_f32)
        self.middle_block_skl.apply(convert_module_to_f32)
        if self.encode_global_skl:
            self.token_proj.apply(convert_module_to_f32)
            self.out_layer_skl.apply(convert_module_to_f32)
        if self.encode_global:
            self.token_proj_ss.apply(convert_module_to_f32)
            self.out_layer.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Kaiming-uniform init for every Linear layer; zero biases."""
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.kaiming_uniform_(module.weight, nonlinearity='linear')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward(self, x: torch.Tensor, x_skl: torch.Tensor = None, sample_posterior: bool = False, return_raw: bool = False) -> Tuple[torch.Tensor, ...]:
        """
        Encode a shape volume and a skeleton volume into latents.

        Args:
            x: dense shape volume (B, in_channels, D, H, W).
            x_skl: dense skeleton volume; required despite the None default
                (it is used unconditionally).
            sample_posterior: sample z ~ N(mean, std) instead of returning the
                mean (ignored in latent-denoising mode, where logvar is 0).
            return_raw: also return (mean, logvar) for both branches.

        Returns:
            ``(z, z_skl)`` or, with ``return_raw``,
            ``(z, mean, logvar, z_skl, mean_skl, logvar_skl)``.
        """
        h = self.input_layer(x)
        h = h.type(self.dtype)
        h_skl = self.input_layer_skl(x_skl)
        h_skl = h_skl.type(self.dtype)

        # The two pyramids run in lockstep (same length by construction).
        for block, block_skl in zip(self.blocks, self.blocks_skl):
            h_skl = block_skl(h_skl)
            h = block(h)
        h_skl = self.middle_block_skl(h_skl)
        h = self.middle_block(h)

        if self.encode_global:
            # Flatten the grid, attach PEs, and let learnable tokens attend to it.
            B, C, D, H, W = h.shape
            h = h.view(B, C, D*H*W).permute(0, 2, 1)  # B, N, C
            h = torch.cat([h, self.grid_pos_emb_ss[None].expand(B, -1, -1)], dim=-1).type(h.dtype)
            init_tokens = torch.cat([self.init_tokens_ss, self.tokens_pos_emb_ss[None].expand_as(self.init_tokens_ss)], dim=-1).type(h.dtype)
            tokens = self.token_proj_ss(init_tokens.expand(B, -1, -1))
            h = self.out_layer[0](tokens, h)  # B, global_token_num, C
            for layer in self.out_layer[1:]:
                h = layer(h)
            h = h.type(x.dtype)
            if self.latent_denoising:
                if self.normalize_latent:
                    h = nn.functional.normalize(h, dim=-1) * self.normalize_scale
                mean = h
                logvar = torch.zeros_like(h)
            else:
                mean, logvar = h.chunk(2, dim=2)  # B, global_token_num, C
            if sample_posterior and not self.latent_denoising:
                std = torch.exp(0.5 * logvar)
                z = mean + std * torch.randn_like(std)
            else:
                z = mean
        else:
            h = h.type(x.dtype)
            h = self.out_layer(h)
            if self.latent_denoising:
                if self.normalize_latent:
                    h = nn.functional.normalize(h, dim=1) * self.normalize_scale
                mean = h
                logvar = torch.zeros_like(h)
            else:
                mean, logvar = h.chunk(2, dim=1)
            if sample_posterior and not self.latent_denoising:
                std = torch.exp(0.5 * logvar)
                z = mean + std * torch.randn_like(std)
            else:
                z = mean

        if self.encode_global_skl:
            # Skeleton branch mirrors the global-token path above.
            B, C, D, H, W = h_skl.shape
            h_skl = h_skl.view(B, C, D*H*W).permute(0, 2, 1)  # B, N, C
            h_skl = torch.cat([h_skl, self.grid_pos_emb[None].expand(B, -1, -1)], dim=-1).type(h_skl.dtype)
            init_tokens = torch.cat([self.init_tokens, self.tokens_pos_emb[None].expand_as(self.init_tokens)], dim=-1).type(h_skl.dtype)
            tokens = self.token_proj(init_tokens.expand(B, -1, -1))
            h_skl = self.out_layer_skl[0](tokens, h_skl)  # B, global_token_num_skl, C
            for layer in self.out_layer_skl[1:]:
                h_skl = layer(h_skl)
            h_skl = h_skl.type(x_skl.dtype)
            if self.latent_denoising_skl:
                if self.normalize_latent_skl:
                    h_skl = nn.functional.normalize(h_skl, dim=-1) * self.normalize_scale
                mean_skl = h_skl
                logvar_skl = torch.zeros_like(h_skl)
            else:
                mean_skl, logvar_skl = h_skl.chunk(2, dim=2)  # B, global_token_num_skl, C
            if sample_posterior and not self.latent_denoising_skl:
                std_skl = torch.exp(0.5 * logvar_skl)
                z_skl = mean_skl + std_skl * torch.randn_like(std_skl)
            else:
                z_skl = mean_skl
        else:
            h_skl = h_skl.type(x_skl.dtype)
            h_skl = self.out_layer_skl(h_skl)
            if self.latent_denoising_skl:
                if self.normalize_latent_skl:
                    h_skl = nn.functional.normalize(h_skl, dim=1) * self.normalize_scale
                mean_skl = h_skl
                logvar_skl = torch.zeros_like(h_skl)
            else:
                mean_skl, logvar_skl = h_skl.chunk(2, dim=1)
            if sample_posterior and not self.latent_denoising_skl:
                std_skl = torch.exp(0.5 * logvar_skl)
                z_skl = mean_skl + std_skl * torch.randn_like(std_skl)
            else:
                z_skl = mean_skl

        # In denoising mode the "mean" target must not receive gradients.
        if self.latent_denoising:
            mean = mean.detach()
        if self.latent_denoising_skl:
            mean_skl = mean_skl.detach()

        if return_raw:
            return z, mean, logvar, z_skl, mean_skl, logvar_skl
        return z, z_skl
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
class AniGenSparseStructureDecoder(nn.Module):
    r"""
    Decoder for Sparse Structure (\mathcal{D}_S in the paper Sec. 3.3).

    Mirrors :class:`AniGenSparseStructureEncoder`: two parallel 3D conv
    pyramids upsample a shape latent and a skeleton latent (``*_skl``) back
    to dense volumes. When ``encode_global*`` is set the branch's input is a
    set of global tokens that are cross-attended back onto a learnable
    bottleneck grid before the conv pyramid.

    Args:
        out_channels (int): Channels of the shape output.
        out_channels_skl (int): Channels of the skeleton output.
        latent_channels (int): Channels of the shape latent representation.
        latent_channels_skl (int): Channels of the skeleton latent representation.
        num_res_blocks (int): Number of residual blocks at each resolution.
        channels (List[int]): Channels of the decoder blocks.
        num_res_blocks_middle (int): Number of residual blocks in the middle.
        norm_type (Literal["group", "layer"]): Type of normalization layer.
        use_fp16 (bool): Whether to use FP16.
        encode_global (bool): Shape latent arrives as global tokens.
        global_token_num (int): Number of global shape tokens.
        encode_global_skl (bool): Skeleton latent arrives as global tokens.
        global_token_num_skl (int): Number of global skeleton tokens.
        use_pretrain_branch (bool): The shape branch comes from a pretrained model.
        freeze_pretrain_branch (bool): Freeze the pretrained shape branch.
        modules_to_freeze (Optional[List[str]]): Attribute names to freeze when
            the pretrained branch is frozen; defaults to
            ``["input_layer", "blocks", "middle_block", "out_layer"]``.
        normalize_z (bool): L2-normalize the incoming shape latent.
        normalize_z_skl (bool): L2-normalize the incoming skeleton latent.
        normalize_scale (float): Scale applied after L2 normalization.
    """
    def __init__(
        self,
        out_channels: int,
        out_channels_skl: int,
        latent_channels: int,
        latent_channels_skl: int,
        num_res_blocks: int,
        channels: List[int],
        num_res_blocks_middle: int = 2,
        norm_type: Literal["group", "layer"] = "layer",
        use_fp16: bool = False,
        encode_global: bool = False,
        global_token_num: int = 1024,
        encode_global_skl: bool = True,
        global_token_num_skl: int = 1024,
        use_pretrain_branch: bool = True,
        freeze_pretrain_branch: bool = True,
        modules_to_freeze: Optional[List[str]] = None,
        normalize_z: bool = False,
        normalize_z_skl: bool = True,
        normalize_scale: float = 1.0,
    ):
        super().__init__()
        # Fix: the original default was a shared mutable list; use None as the
        # sentinel and materialize the documented default here.
        if modules_to_freeze is None:
            modules_to_freeze = ["input_layer", "blocks", "middle_block", "out_layer"]
        self.out_channels = out_channels
        self.out_channels_skl = out_channels_skl
        self.latent_channels = latent_channels
        self.latent_channels_skl = latent_channels_skl
        self.num_res_blocks = num_res_blocks
        self.channels = channels
        self.num_res_blocks_middle = num_res_blocks_middle
        self.norm_type = norm_type
        self.use_fp16 = use_fp16
        self.dtype = torch.float16 if use_fp16 else torch.float32
        self.encode_global = encode_global
        self.global_token_num = global_token_num
        self.encode_global_skl = encode_global_skl
        self.global_token_num_skl = global_token_num_skl
        self.use_pretrain_branch = use_pretrain_branch
        self.freeze_pretrain_branch = freeze_pretrain_branch
        self.normalize_z = normalize_z
        self.normalize_z_skl = normalize_z_skl
        self.normalize_scale = normalize_scale

        if self.encode_global:
            # Learnable bottleneck grid + 3D positional embedding.
            upsample_factor = 2 ** (len(channels) - 1)
            # NOTE(review): assumes a 64^3 output resolution — confirm upstream.
            self.base_size_ss = 64 // upsample_factor
            self.init_grids_ss = nn.Parameter(torch.zeros(1, channels[0], self.base_size_ss**3).permute(0, 2, 1).contiguous().clone())  # 1, N, C
            pos_embedder = AbsolutePositionEmbedder(channels[0], 3)
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [self.base_size_ss] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            grid_pos_emb = pos_embedder(coords)
            self.register_buffer("grid_pos_emb_ss", grid_pos_emb)
            # 1D positional embedding for the incoming token axis.
            pos_embedder = AbsolutePositionEmbedder(channels[0], 1)
            coords = torch.arange(global_token_num, device=self.device).reshape(-1, 1)
            tokens_pos_emb = pos_embedder(coords)
            self.register_buffer('tokens_pos_emb_ss', tokens_pos_emb)
            # Projects [grid | grid-PE] back to the model width.
            self.token_proj_ss = nn.Linear(channels[0]*2, channels[0])

            # Head: 4 self-attn blocks over the tokens (the first widens the
            # latent+PE input), then cross-attn from the grid onto the tokens.
            self.input_layer = nn.ModuleList(
                [TransformerBlock(
                    channels=channels[0] if _ != 0 else latent_channels + channels[0],
                    out_channels=channels[0],
                    num_heads=4 if _ == 0 else 16,
                    attn_mode="full",
                    qkv_bias=False,
                ) for _ in range(4)] +
                [TransformerCrossBlock(
                    channels=channels[0],
                    ctx_channels=channels[0],
                    out_channels=channels[0],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                    x_is_query=False)]
            )
        else:
            self.input_layer = nn.Conv3d(latent_channels, channels[0], 3, padding=1)

        if self.encode_global_skl:
            # Same construction as above, for the skeleton branch.
            upsample_factor = 2 ** (len(channels) - 1)
            self.base_size = 64 // upsample_factor
            self.init_grids = nn.Parameter(torch.zeros(1, channels[0], self.base_size**3).permute(0, 2, 1).contiguous().clone())  # 1, N, C
            pos_embedder = AbsolutePositionEmbedder(channels[0], 3)
            coords = torch.meshgrid(*[torch.arange(res, device=self.device) for res in [self.base_size] * 3], indexing='ij')
            coords = torch.stack(coords, dim=-1).reshape(-1, 3)
            grid_pos_emb = pos_embedder(coords)
            self.register_buffer("grid_pos_emb", grid_pos_emb)
            pos_embedder = AbsolutePositionEmbedder(channels[0], 1)
            coords = torch.arange(global_token_num_skl, device=self.device).reshape(-1, 1)
            tokens_pos_emb = pos_embedder(coords)
            self.register_buffer('tokens_pos_emb', tokens_pos_emb)
            self.token_proj = nn.Linear(channels[0]*2, channels[0])

            self.input_layer_skl = nn.ModuleList(
                [TransformerBlock(
                    channels=channels[0] if _ != 0 else latent_channels_skl + channels[0],
                    out_channels=channels[0],
                    num_heads=4 if _ == 0 else 16,
                    attn_mode="full",
                    qkv_bias=False,
                ) for _ in range(4)] +
                [TransformerCrossBlock(
                    channels=channels[0],
                    ctx_channels=channels[0],
                    out_channels=channels[0],
                    num_heads=16,
                    attn_mode="full",
                    qkv_bias=False,
                    x_is_query=False)]
            )
        else:
            self.input_layer_skl = nn.Conv3d(latent_channels_skl, channels[0], 3, padding=1)

        self.middle_block = nn.Sequential(*[
            ResBlock3d(channels[0], channels[0])
            for _ in range(num_res_blocks_middle)
        ])
        # Fix: original had `channels[0] if _ == 0 else channels[0]` — both
        # branches were identical, so the conditional was dead code.
        self.middle_block_skl = nn.Sequential(*[
            ResBlock3d(channels[0], channels[0])
            for _ in range(num_res_blocks_middle)
        ])

        # Twin upsampling pyramids: shape and skeleton branches mirror each other.
        self.blocks = nn.ModuleList([])
        self.blocks_skl = nn.ModuleList([])
        for i, ch in enumerate(channels):
            self.blocks.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            if i < len(channels) - 1:
                self.blocks.append(
                    UpsampleBlock3d(ch, channels[i+1])
                )
            self.blocks_skl.extend([
                ResBlock3d(ch, ch)
                for _ in range(num_res_blocks)
            ])
            if i < len(channels) - 1:
                self.blocks_skl.append(
                    UpsampleBlock3d(ch, channels[i+1])
                )

        self.out_layer = nn.Sequential(
            norm_layer(norm_type, channels[-1]),
            nn.SiLU(),
            nn.Conv3d(channels[-1], self.out_channels, 3, padding=1)
        )
        self.out_layer_skl = nn.Sequential(
            norm_layer(norm_type, channels[-1]),
            nn.SiLU(),
            nn.Conv3d(channels[-1], self.out_channels_skl, 3, padding=1)
        )

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

        if self.use_pretrain_branch and self.freeze_pretrain_branch:
            # Freeze the pretrained (shape-branch) modules listed by name.
            for module in modules_to_freeze:
                if hasattr(self, module):
                    mod = getattr(self, module)
                    if isinstance(mod, nn.ModuleList):
                        for m in mod:
                            for param in m.parameters():
                                param.requires_grad = False
                    else:
                        for param in mod.parameters():
                            param.requires_grad = False

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model (taken from its first parameter).
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.use_fp16 = True
        self.dtype = torch.float16
        self.blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)
        self.blocks_skl.apply(convert_module_to_f16)
        self.middle_block_skl.apply(convert_module_to_f16)
        if self.encode_global_skl:
            self.token_proj.apply(convert_module_to_f16)
            self.input_layer_skl.apply(convert_module_to_f16)
        if self.encode_global:
            self.token_proj_ss.apply(convert_module_to_f16)
            self.input_layer.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.use_fp16 = False
        self.dtype = torch.float32
        self.blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)
        self.blocks_skl.apply(convert_module_to_f32)
        self.middle_block_skl.apply(convert_module_to_f32)
        if self.encode_global_skl:
            self.token_proj.apply(convert_module_to_f32)
            self.input_layer_skl.apply(convert_module_to_f32)
        if self.encode_global:
            self.token_proj_ss.apply(convert_module_to_f32)
            self.input_layer.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Kaiming-uniform init for every Linear layer; zero biases."""
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.kaiming_uniform_(module.weight, nonlinearity='linear')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward(self, x: torch.Tensor, x_skl: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Decode a shape latent and a skeleton latent into dense volumes.

        Args:
            x: shape latent — (B, tokens, C) when ``encode_global`` else a
                dense (B, C, D, H, W) grid.
            x_skl: skeleton latent, analogous shape per ``encode_global_skl``.

        Returns:
            ``(h, h_skl)`` dense output volumes.
        """
        # NOTE(review): dim=1 normalizes over the token axis when the latent is
        # (B, tokens, C), whereas the encoder normalized over channels (dim=-1)
        # — kept as-is for checkpoint compatibility; verify intent.
        h = F.normalize(x, dim=1) * self.normalize_scale if self.normalize_z else x
        h_skl = F.normalize(x_skl, dim=1) * self.normalize_scale if self.normalize_z_skl else x_skl
        if self.encode_global:
            B, _, _ = h.shape
            h = torch.cat([h, self.tokens_pos_emb_ss[None].expand(B, -1, -1)], dim=-1).type(self.dtype)
            for layer in self.input_layer[:-1]:
                h = layer(h)
            init_grids = torch.cat([self.init_grids_ss, self.grid_pos_emb_ss[None].expand_as(self.init_grids_ss)], dim=-1).type(self.dtype)
            grids = self.token_proj_ss(init_grids.expand(B, -1, -1))
            h = self.input_layer[-1](grids, h)  # B, N, C
            # Fix: was self.base_size, which only exists when encode_global_skl
            # is also enabled; the shape branch must use its own base size.
            h = h.permute(0, 2, 1).view(B, -1, self.base_size_ss, self.base_size_ss, self.base_size_ss)
        else:
            h = self.input_layer(h)
            h = h.type(self.dtype)
        if self.encode_global_skl:
            B, _, _ = h_skl.shape
            h_skl = torch.cat([h_skl, self.tokens_pos_emb[None].expand(B, -1, -1)], dim=-1).type(self.dtype)
            for layer in self.input_layer_skl[:-1]:
                h_skl = layer(h_skl)
            init_grids = torch.cat([self.init_grids, self.grid_pos_emb[None].expand_as(self.init_grids)], dim=-1).type(self.dtype)
            grids = self.token_proj(init_grids.expand(B, -1, -1))
            h_skl = self.input_layer_skl[-1](grids, h_skl)  # B, N, C
            h_skl = h_skl.permute(0, 2, 1).view(B, -1, self.base_size, self.base_size, self.base_size)
        else:
            h_skl = self.input_layer_skl(h_skl)
            h_skl = h_skl.type(self.dtype)
        h_skl = self.middle_block_skl(h_skl)
        h = self.middle_block(h)
        for block, block_skl in zip(self.blocks, self.blocks_skl):
            h_skl = block_skl(h_skl)
            h = block(h)
        h = h.type(x.dtype)
        h = self.out_layer(h)
        # Fix: skeleton output was cast to x.dtype; use the skeleton input's dtype.
        h_skl = h_skl.type(x_skl.dtype)
        h_skl = self.out_layer_skl(h_skl)
        return h, h_skl
|
anigen/models/anigen_structured_latent_flow.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import math
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from anigen.modules.transformer import blocks
|
| 9 |
+
from ..modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
|
| 10 |
+
from ..modules.transformer import AbsolutePositionEmbedder
|
| 11 |
+
from ..modules.norm import LayerNorm32
|
| 12 |
+
from ..modules import sparse as sp
|
| 13 |
+
from ..modules.sparse.transformer import ModulatedSparseTransformerCrossBlock
|
| 14 |
+
from .sparse_elastic_mixin import SparseTransformerElasticMixin
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    A sinusoidal frequency embedding (as in GLIDE/DiT) is computed first and
    then mapped through a small two-layer MLP.
    """

    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        Args:
            t: a 1-D Tensor of N indices, one per batch element.
                These may be fractional.
            dim: the dimension of the output.
            max_period: controls the minimum frequency of the embeddings.

        Returns:
            an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        # Geometric frequency ladder from 1 down to ~1/max_period.
        freqs = torch.exp(
            -np.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=t.device)
        phases = t[:, None].float() * freqs[None]
        out = torch.cat([torch.cos(phases), torch.sin(phases)], dim=-1)
        # Odd target dimension: pad one zero column so the width matches `dim`.
        if dim % 2:
            out = torch.cat([out, torch.zeros_like(out[:, :1])], dim=-1)
        return out

    def forward(self, t):
        """Map a batch of scalar timesteps to (N, hidden_size) embeddings."""
        freq_emb = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(freq_emb)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class SparseResBlock3d(nn.Module):
    """Sparse 3D residual block modulated by a timestep embedding.

    Pipeline: optional 2x down/up-sampling, then norm -> SiLU -> conv twice,
    with an AdaLN-style scale/shift (derived from ``emb``) applied after the
    first convolution, plus a linear skip connection around the whole block.
    """

    def __init__(
        self,
        channels: int,
        emb_channels: int,
        out_channels: Optional[int] = None,
        downsample: bool = False,
        upsample: bool = False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.out_channels = out_channels or channels
        self.downsample = downsample
        self.upsample = upsample

        assert not (downsample and upsample), "Cannot downsample and upsample at the same time"

        self.norm1 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        # norm2 has no affine parameters: scale/shift come from the embedding.
        self.norm2 = LayerNorm32(self.out_channels, elementwise_affine=False, eps=1e-6)
        self.conv1 = sp.SparseConv3d(channels, self.out_channels, 3)
        # Zero-initialized so the residual path starts as (near) identity.
        self.conv2 = zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3))
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            nn.Linear(emb_channels, 2 * self.out_channels, bias=True),
        )
        # Project the skip path only when the channel count changes.
        self.skip_connection = sp.SparseLinear(channels, self.out_channels) if channels != self.out_channels else nn.Identity()
        self.updown = None
        if self.downsample:
            self.updown = sp.SparseDownsample(2)
        elif self.upsample:
            self.updown = sp.SparseUpsample(2)

    def _updown(self, x: sp.SparseTensor) -> sp.SparseTensor:
        # Apply the optional resolution change (identity when neither flag set).
        if self.updown is not None:
            x = self.updown(x)
        return x

    def forward(self, x: sp.SparseTensor, emb: torch.Tensor) -> sp.SparseTensor:
        emb_out = self.emb_layers(emb).type(x.dtype)
        # Per-sample scale/shift of shape [B, C]; broadcasting over sparse
        # voxels is presumably handled by SparseTensor arithmetic — confirm
        # against the sp.SparseTensor operator implementations.
        scale, shift = torch.chunk(emb_out, 2, dim=1)

        x = self._updown(x)
        h = x.replace(self.norm1(x.feats))
        h = h.replace(F.silu(h.feats))
        h = self.conv1(h)
        h = h.replace(self.norm2(h.feats)) * (1 + scale) + shift
        h = h.replace(F.silu(h.feats))
        h = self.conv2(h)
        h = h + self.skip_connection(x)

        return h
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class AniGenSLatFlowModel(nn.Module):
    """Three-branch sparse flow model over structured latents (SLat).

    Runs three coupled transformer branches in lockstep, one block at a time:

    * ``''`` (geometry)  -- denoises geometry latents, cross-attending to the
      external condition ``cond`` (e.g. image features);
    * ``'_vert_skin'``   -- denoises per-vertex skinning latents,
      cross-attending to the skeleton branch and additionally receiving
      geometry features through a zero-initialized per-block adapter;
    * ``'_skl'``         -- denoises skeleton latents, cross-attending to the
      skinning branch.

    Branch-specific hyperparameters and submodules share the geometry names
    plus a ``_vert_skin`` / ``_skl`` suffix and are created/looked up via
    ``getattr`` with that suffix convention throughout.
    """

    def __init__(
        self,
        resolution: int,
        in_channels: int,
        in_channels_vert_skin: int,
        in_channels_skl: int,
        model_channels: int,
        model_channels_vert_skin: int,
        model_channels_skl: int,
        cond_channels: int,
        out_channels: int,
        out_channels_vert_skin: int,
        out_channels_skl: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        num_heads_vert_skin: Optional[int] = None,
        num_head_channels_vert_skin: Optional[int] = 64,
        num_heads_skl: Optional[int] = None,
        num_head_channels_skl: Optional[int] = 64,
        mlp_ratio: float = 4,
        patch_size: int = 2,
        num_io_res_blocks: int = 2,
        num_io_res_blocks_vert_skin: int = 2,
        num_io_res_blocks_skl: int = 2,
        io_block_channels: List[int] = None,
        io_block_channels_vert_skin: List[int] = None,
        io_block_channels_skl: List[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        use_skip_connection: bool = True,
        share_mod: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        use_pretrain_branch: bool = True,
        freeze_pretrain_branch: bool = True,
        # NOTE(review): mutable default argument; harmless here because the
        # list is only iterated, never mutated.
        modules_to_freeze: Optional[List[str]] = ['blocks', 'input_blocks','input_layer', 'out_blocks', 'out_layer', 't_embedder'],
        predict_x0: bool = False,
        t_eps: float = 5e-2,
        t_scale: float = 1e3,
        use_joint_num_cond: bool = False,
        joint_num_max: int = 60,
        joint_num_fourier_bands: int = 6,
    ):
        super().__init__()

        # Class name(s) of pretrained geometry checkpoints this model accepts.
        self.pretrain_class_name = ["AniGenSlatFlowImage"]

        self.resolution = resolution
        self.in_channels = in_channels
        self.in_channels_vert_skin = in_channels_vert_skin
        self.in_channels_skl = in_channels_skl
        self.model_channels = model_channels
        self.model_channels_vert_skin = model_channels_vert_skin
        self.model_channels_skl = model_channels_skl
        self.cond_channels = cond_channels
        self.out_channels = out_channels
        self.out_channels_vert_skin = out_channels_vert_skin
        self.out_channels_skl = out_channels_skl
        self.num_blocks = num_blocks
        # Head counts default to channels / head_channels when not given.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.num_heads_vert_skin = num_heads_vert_skin or model_channels_vert_skin // num_head_channels_vert_skin
        self.num_heads_skl = num_heads_skl or model_channels_skl // num_head_channels_skl
        self.mlp_ratio = mlp_ratio
        self.patch_size = patch_size
        self.num_io_res_blocks = num_io_res_blocks
        self.num_io_res_blocks_vert_skin = num_io_res_blocks_vert_skin
        self.num_io_res_blocks_skl = num_io_res_blocks_skl
        self.io_block_channels = io_block_channels
        self.io_block_channels_vert_skin = io_block_channels_vert_skin
        self.io_block_channels_skl = io_block_channels_skl
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.use_skip_connection = use_skip_connection
        self.share_mod = share_mod
        self.qk_rms_norm = qk_rms_norm
        self.qk_rms_norm_cross = qk_rms_norm_cross
        self.dtype = torch.float16 if use_fp16 else torch.float32
        self.predict_x0 = predict_x0
        self.t_eps = t_eps
        self.t_scale = t_scale
        self.use_joint_num_cond = use_joint_num_cond
        self.joint_num_max = joint_num_max
        self.joint_num_fourier_bands = joint_num_fourier_bands

        if self.io_block_channels is not None:
            # Each IO stage halves/doubles the resolution, so patch_size must
            # be 2**len(io_block_channels).
            assert int(np.log2(patch_size)) == np.log2(patch_size), "Patch size must be a power of 2"
            assert np.log2(patch_size) == len(io_block_channels), "Number of IO ResBlocks must match the number of stages"

        self.t_embedder = TimestepEmbedder(model_channels)
        self.t_embedder_vert_skin = TimestepEmbedder(model_channels_vert_skin)
        self.t_embedder_skl = TimestepEmbedder(model_channels_skl)

        if self.use_joint_num_cond:
            # Joint-number conditioning (applied to skin + skeleton branches).
            # If joints_num is missing/<=0, use learnable unconditional embeddings.
            self.joint_num_embedder_vert_skin = nn.Sequential(
                nn.Linear(2 * joint_num_fourier_bands, model_channels_vert_skin, bias=True),
                nn.SiLU(),
                nn.Linear(model_channels_vert_skin, model_channels_vert_skin, bias=True),
            )
            self.joint_num_embedder_skl = nn.Sequential(
                nn.Linear(2 * joint_num_fourier_bands, model_channels_skl, bias=True),
                nn.SiLU(),
                nn.Linear(model_channels_skl, model_channels_skl, bias=True),
            )
            self.joint_num_uncond_vert_skin = nn.Parameter(torch.zeros(model_channels_vert_skin))
            self.joint_num_uncond_skl = nn.Parameter(torch.zeros(model_channels_skl))
        if share_mod:
            # One shared adaLN modulation per branch instead of per-block ones.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels, 6 * model_channels, bias=True)
            )
            self.adaLN_modulation_vert_skin = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels_vert_skin, 6 * model_channels_vert_skin, bias=True)
            )
            self.adaLN_modulation_skl = nn.Sequential(
                nn.SiLU(),
                nn.Linear(model_channels_skl, 6 * model_channels_skl, bias=True)
            )

        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
            self.pos_embedder_vert_skin = AbsolutePositionEmbedder(model_channels_vert_skin)
            self.pos_embedder_skl = AbsolutePositionEmbedder(model_channels_skl)

        # Causality in conditioning:
        # Geometry <- Conditioned Image (Cross Attention)
        # Skinning <- Geometry (Adapter Layer) + Skeleton (Cross Attention)
        # Skeleton <- Skinning (Cross Attention)
        causial_cond_channels_dict = {'': cond_channels, '_vert_skin': self.model_channels_skl, '_skl': self.model_channels_vert_skin}

        for postfix in ['', '_vert_skin', '_skl']:
            # Input blocks
            setattr(self, f'input_layer{postfix}', sp.SparseLinear(
                getattr(self, f'in_channels{postfix}'),
                getattr(self, f'model_channels{postfix}') if getattr(self, f'io_block_channels{postfix}') is None else getattr(self, f'io_block_channels{postfix}')[0]
            ))

            setattr(self, f'input_blocks{postfix}', nn.ModuleList([]))
            # NOTE: these locals intentionally shadow the constructor
            # parameters with the current branch's values.
            io_block_channels = getattr(self, f'io_block_channels{postfix}')
            model_channels = getattr(self, f'model_channels{postfix}')
            num_io_res_blocks = getattr(self, f'num_io_res_blocks{postfix}')
            if io_block_channels is not None:
                # Per stage: (num_io_res_blocks - 1) same-resolution blocks,
                # then one downsampling block into the next stage's width.
                for chs, next_chs in zip(io_block_channels, io_block_channels[1:] + [model_channels]):
                    getattr(self, f'input_blocks{postfix}').extend([
                        SparseResBlock3d(
                            chs,
                            model_channels,
                            out_channels=chs,
                        )
                        for _ in range(num_io_res_blocks-1)
                    ])
                    getattr(self, f'input_blocks{postfix}').append(
                        SparseResBlock3d(
                            chs,
                            model_channels,
                            out_channels=next_chs,
                            downsample=True,
                        )
                    )

            # Transformer blocks
            cond_channels_block = causial_cond_channels_dict[postfix]
            setattr(self, f'blocks{postfix}', nn.ModuleList([
                ModulatedSparseTransformerCrossBlock(
                    getattr(self, f'model_channels{postfix}'),
                    cond_channels_block,
                    num_heads=getattr(self, f'num_heads{postfix}'),
                    mlp_ratio=self.mlp_ratio,
                    attn_mode='full',
                    use_checkpoint=self.use_checkpoint,
                    use_rope=(pe_mode == "rope"),
                    share_mod=self.share_mod,
                    qk_rms_norm=self.qk_rms_norm,
                    qk_rms_norm_cross=self.qk_rms_norm_cross,
                    norm_for_context=True,
                )
                for _ in range(num_blocks)
            ]))

            # Output blocks (mirror of the input stages, upsampling back).
            setattr(self, f'out_blocks{postfix}', nn.ModuleList([]))
            if io_block_channels is not None:
                for chs, prev_chs in zip(reversed(io_block_channels), [model_channels] + list(reversed(io_block_channels[1:]))):
                    getattr(self, f'out_blocks{postfix}').append(
                        SparseResBlock3d(
                            # Channel count doubles when U-Net skips are concatenated.
                            prev_chs * 2 if self.use_skip_connection else prev_chs,
                            model_channels,
                            out_channels=chs,
                            upsample=True,
                        )
                    )
                    getattr(self, f'out_blocks{postfix}').extend([
                        SparseResBlock3d(
                            chs * 2 if self.use_skip_connection else chs,
                            model_channels,
                            out_channels=chs,
                        )
                        for _ in range(num_io_res_blocks-1)
                    ])
            setattr(self, f'out_layer{postfix}', sp.SparseLinear(model_channels if io_block_channels is None else io_block_channels[0], getattr(self, f'out_channels{postfix}')))

        # Zero-initialized per-block adapters feeding geometry features into
        # the skinning branch.
        self.adapter_geo_to_skin = nn.ModuleList([
            sp.SparseLinear(self.model_channels, self.model_channels_vert_skin) for _ in range(num_blocks)
        ])

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

        self.use_pretrain_branch = use_pretrain_branch
        self.freeze_pretrain_branch = freeze_pretrain_branch
        # self.is_geometry_branch_frozen = self.use_pretrain_branch and self.freeze_pretrain_branch and all([module in modules_to_freeze for module in ['blocks', 'input_blocks','input_layer', 'out_blocks', 'out_layer', 't_embedder']])

        # Freeze the (pretrained) geometry-branch modules when requested.
        if self.use_pretrain_branch and self.freeze_pretrain_branch:
            for module in modules_to_freeze:
                if hasattr(self, module):
                    mod = getattr(self, module)
                    if isinstance(mod, nn.ModuleList):
                        for m in mod:
                            for param in m.parameters():
                                param.requires_grad = False
                    else:
                        for param in mod.parameters():
                            param.requires_grad = False

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.

        Input/output layers, t-embedders, and position embedders stay fp32.
        """
        for postfix in ['', '_vert_skin', '_skl']:
            getattr(self, f'input_blocks{postfix}').apply(convert_module_to_f16)
            getattr(self, f'blocks{postfix}').apply(convert_module_to_f16)
            getattr(self, f'out_blocks{postfix}').apply(convert_module_to_f16)
        self.adapter_geo_to_skin.apply(convert_module_to_f16)
        if self.use_joint_num_cond:
            self.joint_num_embedder_vert_skin.apply(convert_module_to_f16)
            self.joint_num_embedder_skl.apply(convert_module_to_f16)
            self.joint_num_uncond_vert_skin.data = self.joint_num_uncond_vert_skin.data.half()
            self.joint_num_uncond_skl.data = self.joint_num_uncond_skl.data.half()

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        for postfix in ['', '_vert_skin', '_skl']:
            getattr(self, f'input_blocks{postfix}').apply(convert_module_to_f32)
            getattr(self, f'blocks{postfix}').apply(convert_module_to_f32)
            getattr(self, f'out_blocks{postfix}').apply(convert_module_to_f32)
        self.adapter_geo_to_skin.apply(convert_module_to_f32)
        if self.use_joint_num_cond:
            self.joint_num_embedder_vert_skin.apply(convert_module_to_f32)
            self.joint_num_embedder_skl.apply(convert_module_to_f32)
            self.joint_num_uncond_vert_skin.data = self.joint_num_uncond_vert_skin.data.float()
            self.joint_num_uncond_skl.data = self.joint_num_uncond_skl.data.float()

    def initialize_weights(self) -> None:
        """Initialize all weights (Xavier linears, zeroed adaLN/out/adapters)."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        for postfix in ['', '_vert_skin', '_skl']:
            nn.init.normal_(getattr(self, f't_embedder{postfix}').mlp[0].weight, std=0.02)
            nn.init.normal_(getattr(self, f't_embedder{postfix}').mlp[2].weight, std=0.02)
            # Zero-init modulation and output layers so each branch starts as
            # (near) identity.
            if self.share_mod:
                nn.init.constant_(getattr(self, f'adaLN_modulation{postfix}')[-1].weight, 0)
                nn.init.constant_(getattr(self, f'adaLN_modulation{postfix}')[-1].bias, 0)
            else:
                for block in getattr(self, f'blocks{postfix}'):
                    nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
                    nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
            nn.init.constant_(getattr(self, f'out_layer{postfix}').weight, 0)
            nn.init.constant_(getattr(self, f'out_layer{postfix}').bias, 0)

        # Zero-init the adapters so the skinning branch initially ignores geometry.
        for layer in self.adapter_geo_to_skin:
            nn.init.constant_(layer.weight, 0)
            nn.init.constant_(layer.bias, 0)

        if self.use_joint_num_cond:
            # Joint-number conditioning layers
            for emb in [self.joint_num_embedder_vert_skin, self.joint_num_embedder_skl]:
                for m in emb.modules():
                    if isinstance(m, nn.Linear):
                        torch.nn.init.xavier_uniform_(m.weight)
                        if m.bias is not None:
                            nn.init.constant_(m.bias, 0)

    def _fourier_encode_joint_num(self, joints_num: torch.Tensor) -> torch.Tensor:
        """Fourier features for joints_num in [0, joint_num_max]."""
        # Keep dtype consistent with model (e.g., fp16) to avoid Linear dtype mismatch.
        dtype = getattr(self, 'dtype', torch.float32)
        x = (joints_num.to(dtype=dtype) / float(self.joint_num_max)).clamp(0.0, 1.0)
        x = x[:, None]
        freqs = (2.0 ** torch.arange(self.joint_num_fourier_bands, device=x.device, dtype=x.dtype)) * math.pi
        angles = x * freqs[None, :]
        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

    def _get_joint_num_emb(self, joints_num: Optional[torch.Tensor], batch_size: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (emb_vert_skin, emb_skl), shape [B, C_*].

        Missing or non-positive joint counts fall back to the learnable
        unconditional embeddings (per-sample blend via ``uncond_mask``).
        """
        if joints_num is None:
            joints_num = torch.zeros(batch_size, device=device)
        elif not torch.is_tensor(joints_num):
            joints_num = torch.tensor(joints_num, device=device)
        joints_num = joints_num.to(device=device)
        if joints_num.dim() == 0:
            # Scalar count: broadcast to the whole batch.
            joints_num = joints_num[None].expand(batch_size)
        joints_num = joints_num.reshape(batch_size)

        mask_dtype = getattr(self, 'dtype', torch.float32)
        uncond_mask = (joints_num <= 0).to(dtype=mask_dtype, device=device)[:, None]
        joints_num = joints_num.clamp(min=0, max=self.joint_num_max)

        fourier = self._fourier_encode_joint_num(joints_num)
        emb_vs_cond = self.joint_num_embedder_vert_skin(fourier)
        emb_skl_cond = self.joint_num_embedder_skl(fourier)

        emb_vs_uncond = self.joint_num_uncond_vert_skin[None].expand(batch_size, -1)
        emb_skl_uncond = self.joint_num_uncond_skl[None].expand(batch_size, -1)

        # Blend: uncond_mask==1 -> unconditional, uncond_mask==0 -> conditional.
        emb_vs = emb_vs_cond * (1.0 - uncond_mask) + emb_vs_uncond * uncond_mask
        emb_skl = emb_skl_cond * (1.0 - uncond_mask) + emb_skl_uncond * uncond_mask
        return emb_vs, emb_skl

    def forward_stage(
        self,
        x: sp.SparseTensor,
        t: torch.Tensor,
        postfix: str,
        stage: str,
        cond_emb: Optional[torch.Tensor] = None,
        t_emb: Optional[torch.Tensor] = None,
        skips: Optional[list] = None,
        original_dtype: Optional[torch.dtype] = None,
    ) -> sp.SparseTensor:
        """Run the non-transformer part of one branch.

        stage='in':  input projection, timestep (+optional cond) embedding,
                     downsampling resblocks (collecting U-Net skips), APE.
                     Returns (h, t_emb, t_mod, skips).
        stage='out': upsampling resblocks consuming ``skips`` in reverse,
                     final layer norm and output projection. Returns h.
        """
        input_layer = getattr(self, f'input_layer{postfix}')
        t_embedder = getattr(self, f't_embedder{postfix}')
        input_blocks = getattr(self, f'input_blocks{postfix}')
        pos_embedder = getattr(self, f'pos_embedder{postfix}')
        out_blocks = getattr(self, f'out_blocks{postfix}')
        out_layer = getattr(self, f'out_layer{postfix}')
        adaLN_modulation = getattr(self, f'adaLN_modulation{postfix}') if self.share_mod else None

        if stage == 'in':
            h = input_layer(x).type(self.dtype)
            t_emb = t_embedder(t)
            if cond_emb is not None:
                # Additional conditioning (e.g. joint-number embedding) is
                # added onto the timestep embedding.
                t_emb = t_emb + cond_emb
            t_emb = t_emb.type(self.dtype)
            t_mod = adaLN_modulation(t_emb).type(self.dtype) if self.share_mod else t_emb
            skips = []
            # pack with input blocks
            for block in input_blocks:
                h = block(h, t_emb)
                skips.append(h.feats)
            if self.pe_mode == "ape":
                # coords[:, 0] is the batch index; positions are coords[:, 1:].
                h = h + pos_embedder(h.coords[:, 1:]).type(self.dtype)
            return h, t_emb, t_mod, skips
        elif stage == 'out':
            h = x
            # unpack with output blocks
            for block, skip in zip(out_blocks, reversed(skips)):
                if self.use_skip_connection:
                    h = block(h.replace(torch.cat([h.feats, skip], dim=1)), t_emb)
                else:
                    h = block(h, t_emb)
            h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))
            h = out_layer(h.type(original_dtype))
            return h
        else:
            raise ValueError(f"Unknown stage: {stage}")

    def forward(self, x: sp.SparseTensor, x_skl: sp.SparseTensor, t: torch.Tensor, cond: torch.Tensor, joints_num: Optional[torch.Tensor] = None, **kwargs) -> Tuple[sp.SparseTensor, sp.SparseTensor]:
        """Denoise one step for all three branches.

        Args:
            x: sparse tensor whose features are the concatenation of geometry
               latents (first ``in_channels``) and skinning latents (rest).
            x_skl: sparse tensor of skeleton latents.
            t: per-sample timesteps.
            cond: external condition for the geometry branch's cross-attention.
            joints_num: optional per-sample joint counts (see
                ``_get_joint_num_emb``); only used when joint-number
                conditioning is enabled.

        Returns:
            (x_out, x_skl_out): denoised geometry+skin latents and skeleton
            latents. When ``predict_x0`` is set, the skin/skeleton outputs are
            rescaled from x0-predictions into velocity space.
        """
        cond = cond.type(self.dtype)
        # Split the concatenated geometry / skinning features.
        feats, feats_vert_skin = x.feats[:, :self.in_channels], x.feats[:, self.in_channels:]
        x, x_vert_skin = x.replace(feats), x.replace(feats_vert_skin)
        if self.predict_x0:
            # Keep the noisy inputs to convert x0-predictions back to velocity.
            xt_feats_skin, xt_feats_skl = feats_vert_skin.clone(), x_skl.feats.clone()

        joint_emb_vs, joint_emb_skl = None, None
        if self.use_joint_num_cond:
            # joint-number conditioning for skin + skeleton
            joint_emb_vs, joint_emb_skl = self._get_joint_num_emb(joints_num, x.shape[0], x.device)
            joint_emb_vs = joint_emb_vs.type(self.dtype)
            joint_emb_skl = joint_emb_skl.type(self.dtype)

        in_dicts = {'': x, '_vert_skin': x_vert_skin, '_skl': x_skl}
        cond_emb_dicts = {'': None, '_vert_skin': joint_emb_vs, '_skl': joint_emb_skl}
        # Snapshot the branch keys: in_dicts gains t_emb/t_mod/skips entries below.
        postfix_keys = list(in_dicts.keys())
        for postfix in postfix_keys:
            cond_emb = cond_emb_dicts[postfix]
            in_dicts[postfix], in_dicts[f't_emb{postfix}'], in_dicts[f't_mod{postfix}'], in_dicts[f'skips{postfix}'] = self.forward_stage(in_dicts[postfix], t, postfix, stage='in', cond_emb=cond_emb)
        # Lockstep transformer pass; each branch reads the *previous* state of
        # the branch it is conditioned on (updates are applied together).
        for block, block_skin, block_skl, adapter in zip(self.blocks, self.blocks_vert_skin, self.blocks_skl, self.adapter_geo_to_skin):
            h, h_skin, h_skl = in_dicts[''], in_dicts['_vert_skin'], in_dicts['_skl']
            f = block(h, in_dicts['t_mod'], cond)
            f_skin = block_skin(h_skin, in_dicts['t_mod_vert_skin'], h_skl) + adapter(h)
            f_skl = block_skl(h_skl, in_dicts['t_mod_skl'], h_skin)
            in_dicts[''], in_dicts['_vert_skin'], in_dicts['_skl'] = f, f_skin, f_skl
        for postfix in postfix_keys:
            in_dicts[postfix] = self.forward_stage(
                in_dicts[postfix],
                t,
                postfix,
                stage='out',
                t_emb=in_dicts[f't_emb{postfix}'],
                skips=in_dicts[f'skips{postfix}'],
                original_dtype=x.dtype,
            )
        if self.predict_x0:
            # Convert x0-predictions to velocity: v = (x0_pred - x_t) / t,
            # with t clamped away from zero for stability. factor is indexed
            # by each voxel's batch id (coords[:, 0]).
            t_normalized = t / self.t_scale
            factor = (1 / t_normalized.clamp_min(self.t_eps))[:, None]
            in_dicts['_vert_skin'] = in_dicts['_vert_skin'].replace((in_dicts['_vert_skin'].feats - xt_feats_skin) * factor[in_dicts['_vert_skin'].coords[:, 0]])
            in_dicts['_skl'] = in_dicts['_skl'].replace((in_dicts['_skl'].feats - xt_feats_skl) * factor[in_dicts['_skl'].coords[:, 0]])
        x_out = x.replace(torch.cat([in_dicts[''].feats, in_dicts['_vert_skin'].feats], dim=1))
        x_skl_out = x_skl.replace(in_dicts['_skl'].feats)
        return x_out, x_skl_out
|
| 547 |
+
|
| 548 |
+
class AniGenElasticSLatFlowModel(SparseTransformerElasticMixin, AniGenSLatFlowModel):
    """
    SLat Flow Model with elastic memory management.
    Used for training with low VRAM.

    The mixin is listed first so its elastic-memory hooks take MRO
    precedence over the base flow model.
    """
    pass
|
anigen/models/sparse_elastic_mixin.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import contextmanager
|
| 2 |
+
from typing import *
|
| 3 |
+
import math
|
| 4 |
+
from ..modules import sparse as sp
|
| 5 |
+
from ..utils.elastic_utils import ElasticModuleMixin
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SparseTransformerElasticMixin(ElasticModuleMixin):
    """Elastic-memory mixin for sparse transformer models.

    Trades compute for memory by enabling gradient checkpointing on a
    leading fraction of ``self.blocks`` proportional to (1 - mem_ratio).
    """

    def _get_input_size(self, x: sp.SparseTensor, *args, **kwargs):
        # Input "size" is the number of active sparse voxels (rows of feats).
        return x.feats.shape[0]

    @contextmanager
    def with_mem_ratio(self, mem_ratio=1.0):
        """Context manager: yields the exact memory ratio actually achieved."""
        if mem_ratio == 1.0:
            # Full memory budget: no checkpointing needed.
            yield 1.0
            return
        num_blocks = len(self.blocks)
        # Checkpoint enough leading blocks to reach (at most) the requested ratio.
        num_checkpoint_blocks = min(math.ceil((1 - mem_ratio) * num_blocks) + 1, num_blocks)
        exact_mem_ratio = 1 - (num_checkpoint_blocks - 1) / num_blocks
        for i in range(num_blocks):
            self.blocks[i].use_checkpoint = i < num_checkpoint_blocks
        yield exact_mem_ratio
        # NOTE(review): restores use_checkpoint to False unconditionally,
        # even for blocks that had it enabled before entering — confirm this
        # is the intended baseline.
        for i in range(num_blocks):
            self.blocks[i].use_checkpoint = False
|
anigen/models/structured_latent_vae/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .anigen_encoder import AniGenElasticSLatEncoder
|
| 2 |
+
from .anigen_decoder import AniGenElasticSLatMeshDecoder
|
| 3 |
+
from .skin_models import SkinAutoEncoder
|
anigen/models/structured_latent_vae/anigen_base.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ...modules import sparse as sp
|
| 5 |
+
from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
|
| 6 |
+
from ...modules.sparse.transformer import SparseTransformerMultiContextCrossBlock, SparseTransformerBlock
|
| 7 |
+
from ...modules.transformer import AbsolutePositionEmbedder, TransformerCrossBlock
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class FreqPositionalEmbedder(nn.Module):
    """NeRF-style frequency positional encoding.

    Maps an input with trailing dimension ``in_dim`` to the concatenation of
    (optionally) the raw input plus each of ``periodic_fns`` applied at
    ``num_freqs`` frequency bands. The resulting trailing feature size is
    exposed as ``self.out_dim``.
    """

    def __init__(self, in_dim, include_input=True, max_freq_log2=8, num_freqs=8, log_sampling=True, periodic_fns=None):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = None
        self.include_input = include_input
        self.max_freq_log2 = max_freq_log2
        self.num_freqs = num_freqs
        self.log_sampling = log_sampling
        # Default basis: sine and cosine.
        self.periodic_fns = [torch.sin, torch.cos] if periodic_fns is None else periodic_fns
        self.create_embedding_fn()

    def create_embedding_fn(self):
        """Build the list of per-band embedding callables and ``out_dim``."""
        fns = []
        total_dim = 0
        if self.include_input:
            fns.append(lambda x: x)
            total_dim += self.in_dim

        if self.log_sampling:
            # Bands spaced geometrically: 2^0 .. 2^max_freq_log2.
            freq_bands = 2. ** torch.linspace(0., self.max_freq_log2, steps=self.num_freqs)
        else:
            # Bands spaced linearly between 2^0 and 2^max_freq_log2.
            freq_bands = torch.linspace(2. ** 0., 2. ** self.max_freq_log2, steps=self.num_freqs)

        for freq in freq_bands:
            for p_fn in self.periodic_fns:
                # Bind freq/p_fn as defaults to avoid the late-binding closure trap.
                fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
                total_dim += self.in_dim

        self.embed_fns = fns
        self.out_dim = total_dim

    def forward(self, inputs):
        return torch.cat([fn(inputs) for fn in self.embed_fns], -1)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def block_attn_config(self, attn_mode_attr='attn_mode'):
    """
    Yield the per-block attention configuration of the model.

    Each yielded tuple is (mode, window_size, shift_sequence, shift_window,
    serialize_mode) for one of ``self.num_blocks`` transformer blocks;
    shifting modes alternate between even and odd block indices.
    """
    mode = getattr(self, attn_mode_attr)
    for idx in range(self.num_blocks):
        parity = idx % 2
        if mode == "full":
            yield "full", None, None, None, None
        elif mode == "shift_window":
            yield "serialized", self.window_size, 0, (16 * parity,) * 3, sp.SerializeMode.Z_ORDER
        elif mode == "shift_sequence":
            yield "serialized", self.window_size, self.window_size // 2 * parity, (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif mode == "shift_order":
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[idx % 4]
        elif mode == "swin":
            yield "windowed", self.window_size, None, self.window_size // 2 * parity, None
        # Unrecognized modes yield nothing, matching the original behavior.
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class AniGenSparseTransformerBase(nn.Module):
    """
    Sparse Transformer without output layers.
    Serve as the base class for encoder and decoder.

    Runs three parallel streams over sparse tensors: a geometry stream
    (``blocks``, self-attention only) plus skeleton (``blocks_skl``) and
    skin (``blocks_skin``) streams built from multi-context cross-attention
    blocks.  Which streams feed context into which is selected by
    ``skin_cross_from_geo`` / ``skl_cross_from_geo`` / ``skin_skl_cross``.
    """
    def __init__(
        self,
        in_channels: int,
        in_channels_skl: int,
        in_channels_skin: int,
        model_channels: int,
        model_channels_skl: int,
        model_channels_skin: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_heads_skl: int = 8,
        num_heads_skin: int = 8,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        attn_mode_cross: Literal["full", "serialized", "windowed"] = "full",
        window_size: Optional[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,

        skin_cross_from_geo: bool = True,
        skl_cross_from_geo: bool = True,
        skin_skl_cross: bool = True,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.in_channels_skl = in_channels_skl
        self.in_channels_skin = in_channels_skin
        self.model_channels = model_channels
        self.model_channels_skl = model_channels_skl
        self.model_channels_skin = model_channels_skin
        self.num_blocks = num_blocks
        self.window_size = window_size
        # If num_heads is not given, derive it from the per-head channel width.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.attn_mode = attn_mode
        self.attn_mode_cross = attn_mode_cross
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.qk_rms_norm = qk_rms_norm
        # Compute dtype of the torso; inputs are cast in forward_input_layer.
        self.dtype = torch.float16 if use_fp16 else torch.float32
        self.skin_cross_from_geo = skin_cross_from_geo
        self.skl_cross_from_geo = skl_cross_from_geo
        self.skin_skl_cross = skin_skl_cross

        # Absolute position embedders are only needed for APE; "rope" is
        # handled inside the attention blocks via use_rope below.
        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
            self.pos_embedder_skl = AbsolutePositionEmbedder(model_channels_skl)
            self.pos_embedder_skin = AbsolutePositionEmbedder(model_channels_skin)

        self.input_layer = sp.SparseLinear(in_channels, model_channels)
        self.input_layer_skl = sp.SparseLinear(in_channels_skl, model_channels_skl)
        self.input_layer_skin = sp.SparseLinear(in_channels_skin, model_channels_skin)

        # NOTE: the comprehension variables below (attn_mode, window_size,
        # shift_sequence, shift_window, serialize_mode) intentionally shadow
        # the constructor arguments of the same names inside this expression;
        # the per-block values come from block_attn_config(self).
        self.blocks = nn.ModuleList([
            SparseTransformerBlock(
                model_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode=attn_mode,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
            )
            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self)
        ])

        # Context channel widths for the skin stream, in the same order the
        # contexts are assembled in forward(): [skeleton?, geometry?].
        ctx_channels = []
        if skin_skl_cross:
            ctx_channels.append(model_channels_skl)
        if skin_cross_from_geo:
            ctx_channels.append(model_channels)
        self.blocks_skin = nn.ModuleList([
            SparseTransformerMultiContextCrossBlock(
                model_channels_skin,
                ctx_channels=ctx_channels,
                num_heads=num_heads_skin,
                mlp_ratio=self.mlp_ratio,
                attn_mode=attn_mode,
                attn_mode_cross=attn_mode,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
                cross_attn_cache_suffix='_skin',
            )
            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self, "attn_mode_cross")
        ])

        # Context channel widths for the skeleton stream: [skin?, geometry?].
        ctx_channels = []
        if skin_skl_cross:
            ctx_channels.append(model_channels_skin)
        if skl_cross_from_geo:
            ctx_channels.append(model_channels)
        self.blocks_skl = nn.ModuleList([
            SparseTransformerMultiContextCrossBlock(
                model_channels_skl,
                ctx_channels=ctx_channels,
                num_heads=num_heads_skl,
                mlp_ratio=self.mlp_ratio,
                attn_mode=attn_mode,
                attn_mode_cross=attn_mode,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
                cross_attn_cache_suffix='_skl',
            )
            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self, "attn_mode_cross")
        ])

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks.apply(convert_module_to_f16)
        self.blocks_skl.apply(convert_module_to_f16)
        self.blocks_skin.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks.apply(convert_module_to_f32)
        self.blocks_skl.apply(convert_module_to_f32)
        self.blocks_skin.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Xavier-initialize every nn.Linear weight and zero its bias."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward_input_layer(self, x: sp.SparseTensor, layer, pos_embedder) -> sp.SparseTensor:
        """Project input features, add APE position embeddings if enabled,
        and cast to the model compute dtype."""
        h = layer(x)
        if self.pe_mode == "ape":
            # coords[:, 0] is the batch index; embed only the spatial part.
            h = h + pos_embedder(x.coords[:, 1:])
        h = h.type(self.dtype)
        return h

    def forward(self, x: sp.SparseTensor, x_skl: sp.SparseTensor, x_skin: sp.SparseTensor) -> sp.SparseTensor:
        """Run the three streams and return (geometry, skeleton, skin) features.

        At each layer, the cross-attention contexts are taken from the
        *previous* layer's features (f, f_skl, f_skin), so the streams
        update in lockstep rather than sequentially within a layer.
        """
        h = self.forward_input_layer(x, self.input_layer, self.pos_embedder)
        h_skl = self.forward_input_layer(x_skl, self.input_layer_skl, self.pos_embedder_skl)
        h_skin = self.forward_input_layer(x_skin, self.input_layer_skin, self.pos_embedder_skin)

        for block, block_skl, block_skin in zip(self.blocks, self.blocks_skl, self.blocks_skin):
            f, f_skl, f_skin = h, h_skl, h_skin
            h = block(f)
            # Context order must match the ctx_channels order in __init__.
            skl_contexts, skin_contexts = [], []
            if self.skin_skl_cross:
                skl_contexts.append(f_skin)
                skin_contexts.append(f_skl)
            if self.skl_cross_from_geo:
                skl_contexts.append(f)
            if self.skin_cross_from_geo:
                skin_contexts.append(f)
            h_skl = block_skl(f_skl, skl_contexts)
            h_skin = block_skin(f_skin, skin_contexts)
        return h, h_skl, h_skin
|
anigen/models/structured_latent_vae/anigen_decoder.py
ADDED
|
@@ -0,0 +1,834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import contextlib
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from ...modules.sparse.transformer import SparseTransformerMultiContextCrossBlock, SparseTransformerBlock
|
| 6 |
+
from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
|
| 7 |
+
from ...modules import sparse as sp
|
| 8 |
+
from ...representations import MeshExtractResult
|
| 9 |
+
from ...representations.mesh import AniGenSparseFeatures2Mesh, AniGenSklFeatures2Skeleton
|
| 10 |
+
from ..sparse_elastic_mixin import SparseTransformerElasticMixin
|
| 11 |
+
from pytorch3d.ops import knn_points
|
| 12 |
+
from .anigen_base import AniGenSparseTransformerBase, FreqPositionalEmbedder
|
| 13 |
+
from .skin_models import SKIN_MODEL_DICT
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
from ...representations.skeleton.grouping import GROUPING_STRATEGIES
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SparseSubdivideBlock3d(nn.Module):
    """
    A 3D subdivide block that can subdivide the sparse tensor.

    Args:
        channels: channels in the inputs and outputs.
        out_channels: if specified, the number of output channels.
        num_groups: the number of groups for the group norm.
        sub_divide: if True, double the spatial resolution before the convs;
            if False, the block acts as a plain residual block.
        conv_as_residual: if True, force a 1x1-conv skip connection even
            when input and output channels match.
    """
    def __init__(
        self,
        channels: int,
        resolution: int,
        out_channels: Optional[int] = None,
        num_groups: int = 32,
        sub_divide: bool = True,
        conv_as_residual: bool = False,
    ):
        super().__init__()
        self.channels = channels
        self.resolution = resolution
        # Subdividing doubles the nominal spatial resolution.
        self.out_resolution = resolution * 2 if sub_divide else resolution
        self.out_channels = out_channels or channels
        self.sub_divide = sub_divide
        self.conv_as_residual = conv_as_residual

        self.act_layers = nn.Sequential(
            sp.SparseGroupNorm32(num_groups, channels),
            sp.SparseSiLU()
        )

        self.sub = sp.SparseSubdivide() if sub_divide else nn.Identity()

        # indice_key ties spconv rulebooks to the output resolution so that
        # convs operating on the same coordinate set can share them.
        self.out_layers = nn.Sequential(
            sp.SparseConv3d(channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}"),
            sp.SparseGroupNorm32(num_groups, self.out_channels),
            sp.SparseSiLU(),
            # Zero-initialized last conv so the block starts near identity.
            zero_module(sp.SparseConv3d(self.out_channels, self.out_channels, 3, indice_key=f"res_{self.out_resolution}")),
        )

        if self.out_channels == channels and not self.conv_as_residual:
            self.skip_connection = nn.Identity()
        else:
            # 1x1 conv to match channel widths on the residual path.
            self.skip_connection = sp.SparseConv3d(channels, self.out_channels, 1, indice_key=f"res_{self.out_resolution}")

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """
        Apply the block: norm+SiLU, optional subdivision, convs, and a
        residual connection (the skip path is subdivided too so both
        operands share coordinates).

        Args:
            x: an [N x C x ...] Tensor of features.
        Returns:
            an [N x C x ...] Tensor of outputs.
        """
        h = self.act_layers(x)
        h = self.sub(h)
        x = self.sub(x)
        h = self.out_layers(h)
        h = h + self.skip_connection(x)
        return h
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class SparseDownsampleWithCache(nn.Module):
    """SparseDownsample that stores upsample caches under a unique suffix.

    This avoids cache-key collisions when stacking multiple down/up stages.
    Must be paired with :class:`SparseUpsampleWithCache` using the same
    ``cache_suffix``.
    """
    def __init__(self, factor: Union[int, Tuple[int, ...], List[int]], cache_suffix: str):
        super().__init__()
        # Normalize list factors to tuples so cache-key formatting is stable.
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor
        self.cache_suffix = cache_suffix
        self._down = sp.SparseDownsample(self.factor)

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """Downsample ``x`` and re-key its inverse-mapping caches with
        ``cache_suffix`` so later stages cannot overwrite them.

        Raises:
            ValueError: if SparseDownsample did not register its caches.
        """
        out = self._down(x)

        # SparseDownsample registers its inverse-mapping caches under keys
        # derived from the per-dimension factor tuple.
        dim = out.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * dim
        k_coords = f'upsample_{factor}_coords'
        k_layout = f'upsample_{factor}_layout'
        k_idx = f'upsample_{factor}_idx'
        coords = out.get_spatial_cache(k_coords)
        layout = out.get_spatial_cache(k_layout)
        idx = out.get_spatial_cache(k_idx)
        if any(v is None for v in [coords, layout, idx]):
            raise ValueError('Downsample cache not found after SparseDownsample.')

        # spconv expects int32 indices; SparseDownsample produces int64 coords.
        if out.coords.dtype != torch.int32:
            out = sp.SparseTensor(
                out.feats,
                out.coords.to(torch.int32),
                out.shape,
                out.layout,
                scale=out._scale,
                spatial_cache=out._spatial_cache,
            )

        # Re-register the caches under a per-stage suffixed key.
        out.register_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_coords', coords)
        out.register_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_layout', layout)
        out.register_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_idx', idx)
        # Remove unsuffixed keys to prevent later stages overwriting them.
        # pop(key, None) keeps the old "ignore if absent" behavior but no
        # longer swallows unrelated errors the way the previous broad
        # try/except around `del` did.
        for key in (k_coords, k_layout, k_idx):
            out._spatial_cache.pop(key, None)
        return out
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class SparseUpsampleWithCache(nn.Module):
    """SparseUpsample that reads upsample caches under a unique suffix.

    Inverts a :class:`SparseDownsampleWithCache` stage that used the same
    ``factor`` and ``cache_suffix``.
    """
    def __init__(self, factor: Union[int, Tuple[int, ...], List[int]], cache_suffix: str):
        super().__init__()
        # Normalize list factors to tuples so cache-key formatting is stable.
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor
        self.cache_suffix = cache_suffix

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """Restore the pre-downsample coordinate set using the stored caches.

        Raises:
            ValueError: if the suffixed caches are missing (i.e. this stage
                was not preceded by a matching SparseDownsampleWithCache).
        """
        dim = x.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * dim
        new_coords = x.get_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_coords')
        new_layout = x.get_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_layout')
        idx = x.get_spatial_cache(f'upsample_{factor}_{self.cache_suffix}_idx')
        if any(v is None for v in [new_coords, new_layout, idx]):
            raise ValueError('Upsample cache not found. Must be paired with SparseDownsampleWithCache.')
        if new_coords.dtype != torch.int32:
            # spconv expects int32 coordinates.
            new_coords = new_coords.to(torch.int32)
        # Gather each fine voxel's feature from its coarse parent.
        new_feats = x.feats[idx]
        out = sp.SparseTensor(new_feats, new_coords, x.shape, new_layout)
        # Undo the scale change applied by the downsample stage.
        out._scale = tuple([s * f for s, f in zip(x._scale, factor)])
        out._spatial_cache = x._spatial_cache
        return out
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class SparseSkinUNetNLevel(nn.Module):
    """A simple N-down/N-up sparse UNet for local smoothing.

    Note: `SparseSubdivideBlock3d` uses `resolution` only to name spconv `indice_key`s.
    We must provide distinct (and stage-appropriate) values per hierarchy to avoid
    rulebook collisions across different coordinate sets.
    """
    def __init__(self, channels: int, base_resolution: int, num_groups: int = 32, num_levels: int = 3):
        super().__init__()

        if num_levels < 1:
            raise ValueError(f"num_levels must be >= 1, got {num_levels}")
        self.channels = channels
        self.base_resolution = int(base_resolution)
        self.num_groups = num_groups
        self.num_levels = int(num_levels)

        # Residual block (no subdivision); `resolution` only disambiguates
        # spconv indice_keys per hierarchy level (see class note).
        def res_block(resolution: int):
            return SparseSubdivideBlock3d(
                channels=channels,
                resolution=resolution,
                out_channels=channels,
                sub_divide=False,
                conv_as_residual=True,
                num_groups=num_groups,
            )

        # resolutions[i] corresponds to the i-th encoder stage (before downsample)
        resolutions: List[int] = [max(1, self.base_resolution // (2 ** i)) for i in range(self.num_levels)]
        bottom_resolution = max(1, self.base_resolution // (2 ** self.num_levels))

        self.enc = nn.ModuleList([res_block(r) for r in resolutions])
        # Per-stage cache suffixes keep each down/up pair's caches separate.
        self.down = nn.ModuleList([SparseDownsampleWithCache(2, f'unet{i}') for i in range(self.num_levels)])
        self.mid = res_block(bottom_resolution)

        # Decoder blocks operate at the same resolutions as encoder blocks.
        self.up = nn.ModuleList([SparseUpsampleWithCache(2, f'unet{i}') for i in range(self.num_levels)])
        # fuse[i] mixes the upsampled features with the skip features.
        self.fuse = nn.ModuleList([sp.SparseLinear(channels * 2, channels) for _ in range(self.num_levels)])
        self.dec = nn.ModuleList([res_block(r) for r in resolutions])

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """Encode-decode with skip connections; the output keeps the input
        dtype and coordinate set."""
        in_dtype = x.feats.dtype
        # spconv expects int32 coordinates.
        if x.coords.dtype != torch.int32:
            x = sp.SparseTensor(
                x.feats,
                x.coords.to(torch.int32),
                x.shape,
                x.layout,
                scale=x._scale,
                spatial_cache=x._spatial_cache,
            )

        # spconv implicit_gemm has a runtime tuner that can fail for some sparse
        # rulebooks under AMP + fp16/bf16. Running UNet convs in fp32 avoids that.
        if hasattr(torch, 'autocast'):
            autocast_ctx = torch.autocast(device_type=x.device.type, enabled=False)
        else:
            # Older torch fallback
            autocast_ctx = torch.cuda.amp.autocast(enabled=False) if x.device.type == 'cuda' else contextlib.nullcontext()

        with autocast_ctx:
            # Force fp32 features inside the disabled-autocast region.
            x_fp32 = x if x.feats.dtype == torch.float32 else x.replace(x.feats.float())

            skips: List[sp.SparseTensor] = []
            h = x_fp32
            # Encoder: residual block, remember skip, then downsample.
            for i in range(self.num_levels):
                s = self.enc[i](h)
                skips.append(s)
                h = self.down[i](s)

            h = self.mid(h)

            # Decoder: upsample, fuse with the matching skip, residual block.
            for i in reversed(range(self.num_levels)):
                h_up = self.up[i](h)
                s = skips[i]
                h = self.fuse[i](h_up.replace(torch.cat([h_up.feats, s.feats], dim=-1)))
                h = self.dec[i](h)

            u0 = h

        # Restore the caller's dtype (e.g. fp16 under AMP).
        if in_dtype != u0.feats.dtype:
            u0 = u0.replace(u0.feats.to(dtype=in_dtype))
        return u0
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
class AniGenSLatMeshDecoder(AniGenSparseTransformerBase):
|
| 239 |
+
def __init__(
|
| 240 |
+
self,
|
| 241 |
+
resolution: int,
|
| 242 |
+
model_channels: int,
|
| 243 |
+
model_channels_skl: int,
|
| 244 |
+
model_channels_skin: int,
|
| 245 |
+
|
| 246 |
+
latent_channels: int,
|
| 247 |
+
latent_channels_skl: int,
|
| 248 |
+
latent_channels_vertskin: int,
|
| 249 |
+
|
| 250 |
+
num_blocks: int,
|
| 251 |
+
num_heads: Optional[int] = None,
|
| 252 |
+
num_head_channels: Optional[int] = 64,
|
| 253 |
+
|
| 254 |
+
num_heads_skl: int = 32,
|
| 255 |
+
num_heads_skin: int = 32,
|
| 256 |
+
|
| 257 |
+
skin_cross_from_groupped: bool = False,
|
| 258 |
+
h_skin_unet_num_levels: int = 4,
|
| 259 |
+
|
| 260 |
+
skin_decoder_config: Optional[Dict[str, Any]] = {},
|
| 261 |
+
|
| 262 |
+
upsample_skl: bool = False,
|
| 263 |
+
skl_defined_on_center: bool = True,
|
| 264 |
+
mlp_ratio: float = 4,
|
| 265 |
+
attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
|
| 266 |
+
attn_mode_cross: Literal["full", "serialized", "windowed"] = "full",
|
| 267 |
+
window_size: int = 8,
|
| 268 |
+
pe_mode: Literal["ape", "rope"] = "ape",
|
| 269 |
+
use_fp16: bool = False,
|
| 270 |
+
use_checkpoint: bool = False,
|
| 271 |
+
qk_rms_norm: bool = False,
|
| 272 |
+
representation_config: dict = None,
|
| 273 |
+
|
| 274 |
+
use_pretrain_branch: bool = True,
|
| 275 |
+
freeze_pretrain_branch: bool = True,
|
| 276 |
+
modules_to_freeze: Optional[List[str]] = ["blocks", "upsample", "out_layer", "skin_decoder"],
|
| 277 |
+
|
| 278 |
+
skin_cross_from_geo: bool = False,
|
| 279 |
+
skl_cross_from_geo: bool = False,
|
| 280 |
+
skin_skl_cross: bool = False,
|
| 281 |
+
skin_ae_name: str = "SkinAE",
|
| 282 |
+
|
| 283 |
+
normalize_z: bool = False,
|
| 284 |
+
normalize_scale: float = 1.0,
|
| 285 |
+
|
| 286 |
+
jp_residual_fields: bool = True,
|
| 287 |
+
jp_hyper_continuous: bool = True,
|
| 288 |
+
|
| 289 |
+
grouping_strategy: Literal["mean_shift", "threshold"] = "mean_shift",
|
| 290 |
+
|
| 291 |
+
vertex_skin_feat_interp_sparse: bool = False,
|
| 292 |
+
vertex_skin_feat_interp_nearest: bool = False,
|
| 293 |
+
vertex_skin_feat_interp_use_deformed_grid: bool = False,
|
| 294 |
+
vertex_skin_feat_interp_trilinear: bool = False,
|
| 295 |
+
flexicube_disable_deform: bool = False,
|
| 296 |
+
vertex_skin_feat_nodeform_trilinear: bool = False,
|
| 297 |
+
):
|
| 298 |
+
super().__init__(
|
| 299 |
+
in_channels=latent_channels,
|
| 300 |
+
in_channels_skl=latent_channels_skl,
|
| 301 |
+
in_channels_skin=latent_channels_vertskin,
|
| 302 |
+
model_channels=model_channels,
|
| 303 |
+
model_channels_skl=model_channels_skl,
|
| 304 |
+
model_channels_skin=model_channels_skin,
|
| 305 |
+
num_blocks=num_blocks,
|
| 306 |
+
num_heads=num_heads,
|
| 307 |
+
num_heads_skl=num_heads_skl,
|
| 308 |
+
num_heads_skin=num_heads_skin,
|
| 309 |
+
num_head_channels=num_head_channels,
|
| 310 |
+
mlp_ratio=mlp_ratio,
|
| 311 |
+
attn_mode=attn_mode,
|
| 312 |
+
attn_mode_cross=attn_mode_cross,
|
| 313 |
+
window_size=window_size,
|
| 314 |
+
pe_mode=pe_mode,
|
| 315 |
+
use_fp16=use_fp16,
|
| 316 |
+
use_checkpoint=use_checkpoint,
|
| 317 |
+
qk_rms_norm=qk_rms_norm,
|
| 318 |
+
skin_cross_from_geo=skin_cross_from_geo,
|
| 319 |
+
skl_cross_from_geo=skl_cross_from_geo,
|
| 320 |
+
skin_skl_cross=skin_skl_cross,
|
| 321 |
+
)
|
| 322 |
+
self.pretrain_class_name = ["AniGenElasticSLatMeshDecoder", skin_ae_name]
|
| 323 |
+
self.pretrain_ckpt_filter_prefix = {skin_ae_name: "skin_decoder"}
|
| 324 |
+
self.latent_channels = latent_channels
|
| 325 |
+
self.latent_channels_skl = latent_channels_skl
|
| 326 |
+
self.latent_channels_vertskin = latent_channels_vertskin
|
| 327 |
+
self.jp_residual_fields = jp_residual_fields
|
| 328 |
+
self.jp_hyper_continuous = jp_hyper_continuous
|
| 329 |
+
self.grouping_func = GROUPING_STRATEGIES[grouping_strategy]
|
| 330 |
+
self.skin_cross_from_groupped = skin_cross_from_groupped
|
| 331 |
+
|
| 332 |
+
self.normalize_z = normalize_z
|
| 333 |
+
self.normalize_scale = normalize_scale
|
| 334 |
+
|
| 335 |
+
skin_decoder_config['use_fp16'] = use_fp16
|
| 336 |
+
self.skin_decoder = SKIN_MODEL_DICT[skin_decoder_config.pop('model_type')](**skin_decoder_config)
|
| 337 |
+
self.skin_feat_channels = self.skin_decoder.skin_feat_channels
|
| 338 |
+
|
| 339 |
+
# Optional local smoothing UNet on h_skin (independent of grouped cross-attn).
|
| 340 |
+
# If `h_skin_unet_num_levels < 0`, UNet is disabled.
|
| 341 |
+
self.h_skin_unet_num_levels = int(h_skin_unet_num_levels)
|
| 342 |
+
if self.h_skin_unet_num_levels >= 1:
|
| 343 |
+
self.h_skin_unet = SparseSkinUNetNLevel(
|
| 344 |
+
model_channels_skin,
|
| 345 |
+
base_resolution=resolution,
|
| 346 |
+
num_levels=self.h_skin_unet_num_levels,
|
| 347 |
+
)
|
| 348 |
+
else:
|
| 349 |
+
self.h_skin_unet = None
|
| 350 |
+
|
| 351 |
+
if self.skin_cross_from_groupped:
|
| 352 |
+
# Trainable parent feature for root joints (where parent_idx < 0).
|
| 353 |
+
self.root_parent_feat = nn.Parameter(torch.zeros(self.skin_feat_channels))
|
| 354 |
+
|
| 355 |
+
# Joint feature preprocessing: [joint_skin, fourier(joint_xyz), parent_skin] -> proj -> self-attn
|
| 356 |
+
self.joints_pos_embedder = FreqPositionalEmbedder(
|
| 357 |
+
in_dim=3,
|
| 358 |
+
include_input=True,
|
| 359 |
+
max_freq_log2=6,
|
| 360 |
+
num_freqs=6,
|
| 361 |
+
log_sampling=True,
|
| 362 |
+
)
|
| 363 |
+
joints_pe_dim = self.joints_pos_embedder.out_dim
|
| 364 |
+
joints_in_dim = self.skin_feat_channels + joints_pe_dim + self.skin_feat_channels
|
| 365 |
+
self.joints_ctx_channels = model_channels_skin
|
| 366 |
+
self.joints_in_proj = nn.Sequential(
|
| 367 |
+
nn.Linear(joints_in_dim, self.joints_ctx_channels, bias=True),
|
| 368 |
+
nn.SiLU(),
|
| 369 |
+
nn.LayerNorm(self.joints_ctx_channels, elementwise_affine=True),
|
| 370 |
+
)
|
| 371 |
+
self.joints_self_attn = nn.ModuleList([
|
| 372 |
+
SparseTransformerBlock(
|
| 373 |
+
self.joints_ctx_channels,
|
| 374 |
+
num_heads=num_heads_skin,
|
| 375 |
+
mlp_ratio=self.mlp_ratio,
|
| 376 |
+
attn_mode="full",
|
| 377 |
+
window_size=None,
|
| 378 |
+
use_checkpoint=self.use_checkpoint,
|
| 379 |
+
use_rope=False,
|
| 380 |
+
qk_rms_norm=self.qk_rms_norm,
|
| 381 |
+
ln_affine=True,
|
| 382 |
+
) for _ in range(4)
|
| 383 |
+
])
|
| 384 |
+
|
| 385 |
+
# Coordinate PE for h_skin before cross-attn: coords in [-1, 1] -> Fourier PE -> proj(C), concat, fuse back to C.
|
| 386 |
+
self.h_skin_coord_embedder = FreqPositionalEmbedder(
|
| 387 |
+
in_dim=3,
|
| 388 |
+
include_input=True,
|
| 389 |
+
max_freq_log2=6,
|
| 390 |
+
num_freqs=6,
|
| 391 |
+
log_sampling=True,
|
| 392 |
+
)
|
| 393 |
+
h_skin_pe_dim = self.h_skin_coord_embedder.out_dim
|
| 394 |
+
self.h_skin_coord_proj = nn.Linear(h_skin_pe_dim, model_channels_skin, bias=True)
|
| 395 |
+
self.h_skin_coord_fuse = sp.SparseLinear(model_channels_skin * 2, model_channels_skin)
|
| 396 |
+
|
| 397 |
+
self.skin_cross_groupped_net = SparseTransformerMultiContextCrossBlock(
|
| 398 |
+
model_channels_skin,
|
| 399 |
+
# Context includes processed joint tokens + raw joint skin feats (skip connection).
|
| 400 |
+
ctx_channels=[self.joints_ctx_channels + self.skin_feat_channels],
|
| 401 |
+
num_heads=num_heads_skin,
|
| 402 |
+
mlp_ratio=self.mlp_ratio,
|
| 403 |
+
attn_mode="full",
|
| 404 |
+
attn_mode_cross="full",
|
| 405 |
+
cross_attn_cache_suffix='_skin_cross_from_groupped',
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
self.resolution = resolution
|
| 409 |
+
self.use_pretrain_branch = use_pretrain_branch
|
| 410 |
+
self.freeze_pretrain_branch = freeze_pretrain_branch
|
| 411 |
+
self.upsample_skl = upsample_skl
|
| 412 |
+
self.rep_config = representation_config
|
| 413 |
+
self.mesh_extractor = AniGenSparseFeatures2Mesh(
|
| 414 |
+
res=self.resolution*4,
|
| 415 |
+
use_color=self.rep_config.get('use_color', False),
|
| 416 |
+
skin_feat_channels=self.skin_feat_channels,
|
| 417 |
+
predict_skin=True,
|
| 418 |
+
vertex_skin_feat_interp_sparse=vertex_skin_feat_interp_sparse,
|
| 419 |
+
vertex_skin_feat_interp_nearest=vertex_skin_feat_interp_nearest,
|
| 420 |
+
vertex_skin_feat_interp_use_deformed_grid=vertex_skin_feat_interp_use_deformed_grid,
|
| 421 |
+
vertex_skin_feat_interp_trilinear=vertex_skin_feat_interp_trilinear,
|
| 422 |
+
flexicube_disable_deform=flexicube_disable_deform,
|
| 423 |
+
vertex_skin_feat_nodeform_trilinear=vertex_skin_feat_nodeform_trilinear,
|
| 424 |
+
)
|
| 425 |
+
self.out_channels = self.mesh_extractor.feats_channels
|
| 426 |
+
self.upsample = nn.ModuleList([
|
| 427 |
+
SparseSubdivideBlock3d(
|
| 428 |
+
channels=model_channels,
|
| 429 |
+
resolution=resolution,
|
| 430 |
+
out_channels=model_channels // 4
|
| 431 |
+
),
|
| 432 |
+
SparseSubdivideBlock3d(
|
| 433 |
+
channels=model_channels // 4,
|
| 434 |
+
resolution=resolution * 2,
|
| 435 |
+
out_channels=model_channels // 8
|
| 436 |
+
)
|
| 437 |
+
])
|
| 438 |
+
upsample_skin_blocks = []
|
| 439 |
+
upsample_skin_blocks.extend([
|
| 440 |
+
SparseSubdivideBlock3d(
|
| 441 |
+
channels=model_channels_skin,
|
| 442 |
+
resolution=resolution,
|
| 443 |
+
out_channels=model_channels // 4
|
| 444 |
+
),
|
| 445 |
+
SparseSubdivideBlock3d(
|
| 446 |
+
channels=model_channels // 4,
|
| 447 |
+
resolution=resolution * 2,
|
| 448 |
+
out_channels=model_channels // 8
|
| 449 |
+
)
|
| 450 |
+
])
|
| 451 |
+
|
| 452 |
+
self.upsample_skin_net = nn.ModuleList(upsample_skin_blocks)
|
| 453 |
+
self.out_layer = sp.SparseLinear(model_channels // 8, self.out_channels)
|
| 454 |
+
self.out_layer_skin = sp.SparseLinear(model_channels // 8, self.skin_feat_channels*8)
|
| 455 |
+
self.out_layer_skl_skin = sp.SparseLinear(model_channels // 8 if upsample_skl else model_channels_skl, self.skin_feat_channels if skl_defined_on_center else self.skin_feat_channels * 8)
|
| 456 |
+
self.use_conf_jp = self.rep_config.get('use_conf_jp', False) or self.jp_hyper_continuous
|
| 457 |
+
self.use_conf_skin = self.rep_config.get('use_conf_skin', False)
|
| 458 |
+
|
| 459 |
+
res_skl = self.resolution * 4 if self.upsample_skl else self.resolution
|
| 460 |
+
self.skeleton_extractor = AniGenSklFeatures2Skeleton(skin_feat_channels=self.skin_feat_channels, device=self.device, res=res_skl, use_conf_jp=self.use_conf_jp, use_conf_skin=self.use_conf_skin, predict_skin=True, defined_on_center=skl_defined_on_center, jp_hyper_continuous=self.jp_hyper_continuous, jp_residual_fields=self.jp_residual_fields)
|
| 461 |
+
|
| 462 |
+
self.out_channels_skl = self.skeleton_extractor.feats_channels
|
| 463 |
+
if self.upsample_skl:
|
| 464 |
+
self.upsample_skl_net = nn.ModuleList([
|
| 465 |
+
SparseSubdivideBlock3d(
|
| 466 |
+
channels=model_channels_skl,
|
| 467 |
+
resolution=resolution,
|
| 468 |
+
out_channels=model_channels // 4
|
| 469 |
+
),
|
| 470 |
+
SparseSubdivideBlock3d(
|
| 471 |
+
channels=model_channels // 4,
|
| 472 |
+
resolution=resolution * 2,
|
| 473 |
+
out_channels=model_channels // 8
|
| 474 |
+
)
|
| 475 |
+
])
|
| 476 |
+
self.out_layer_skl = sp.SparseLinear(model_channels // 8, self.out_channels_skl)
|
| 477 |
+
else:
|
| 478 |
+
self.out_layer_skl = sp.SparseLinear(model_channels_skl, self.out_channels_skl)
|
| 479 |
+
|
| 480 |
+
self.initialize_weights()
|
| 481 |
+
if use_fp16:
|
| 482 |
+
self.convert_to_fp16()
|
| 483 |
+
else:
|
| 484 |
+
self.convert_to_fp32()
|
| 485 |
+
|
| 486 |
+
if self.use_pretrain_branch and self.freeze_pretrain_branch:
|
| 487 |
+
for module in modules_to_freeze:
|
| 488 |
+
if hasattr(self, module):
|
| 489 |
+
mod = getattr(self, module)
|
| 490 |
+
if isinstance(mod, nn.ModuleList):
|
| 491 |
+
for m in mod:
|
| 492 |
+
for name, param in m.named_parameters():
|
| 493 |
+
if 'lora' not in name:
|
| 494 |
+
param.requires_grad = False
|
| 495 |
+
elif isinstance(mod, nn.Module):
|
| 496 |
+
for name, param in mod.named_parameters():
|
| 497 |
+
if 'lora' not in name:
|
| 498 |
+
param.requires_grad = False
|
| 499 |
+
elif isinstance(mod, torch.Tensor):
|
| 500 |
+
if mod.requires_grad:
|
| 501 |
+
mod.requires_grad = False
|
| 502 |
+
|
| 503 |
+
    def initialize_weights(self) -> None:
        """
        Initialize weights via the base class, then re-initialize all output
        heads with small-magnitude Kaiming weights and zero biases.

        The heads are scaled by 1e-4 so the decoder starts close to a zero
        output, which keeps early training stable.
        """
        super().initialize_weights()
        scale = 1e-4
        # Kaiming initialization for output layers (better for ReLU/SiLU-like activations)
        nn.init.kaiming_normal_(self.out_layer.weight, mode='fan_in', nonlinearity='relu')
        self.out_layer.weight.data.mul_(scale)
        nn.init.constant_(self.out_layer.bias, 0)

        # Skeleton head: same small-Kaiming scheme as the mesh head.
        nn.init.kaiming_normal_(self.out_layer_skl.weight, mode='fan_in', nonlinearity='relu')
        self.out_layer_skl.weight.data.mul_(scale)
        nn.init.constant_(self.out_layer_skl.bias, 0)

        # Initialize skin layer:
        self.skin_decoder.initialize_weights()
        nn.init.kaiming_normal_(self.out_layer_skin.weight, mode='fan_in', nonlinearity='relu')
        self.out_layer_skin.weight.data.mul_(scale)
        nn.init.constant_(self.out_layer_skin.bias, 0)

        # Skeleton-skin head follows the same scheme.
        nn.init.kaiming_normal_(self.out_layer_skl_skin.weight, mode='fan_in', nonlinearity='relu')
        self.out_layer_skl_skin.weight.data.mul_(scale)
        nn.init.constant_(self.out_layer_skl_skin.bias, 0)
| 525 |
+
    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.

        Mixed-precision policy (order- and module-specific, do not reorder):
        - Upsampling paths and (if enabled) the grouped-joint cross-attention
          stack are cast to fp16.
        - LayerNorms inside ``joints_in_proj`` are cast to fp16 explicitly
          because ``convert_module_to_f16`` skips ``nn.LayerNorm``.
        - LayerNorms inside ``joints_self_attn`` are forced back to fp32
          (LayerNorm32 normalizes in fp32 internally).
        - ``h_skin_unet`` stays fp32 (it is executed in fp32).
        """
        super().convert_to_fp16()
        self.upsample.apply(convert_module_to_f16)
        self.upsample_skin_net.apply(convert_module_to_f16)
        if self.upsample_skl:
            self.upsample_skl_net.apply(convert_module_to_f16)
        if self.skin_cross_from_groupped:
            # Joint preprocessing and cross-attn should match model dtype.
            self.root_parent_feat.data = self.root_parent_feat.data.half()
            self.joints_in_proj.apply(convert_module_to_f16)
            self.joints_self_attn.apply(convert_module_to_f16)

            # `convert_module_to_f16` doesn't include `nn.LayerNorm`, so cast LN params explicitly.
            for _m in self.joints_in_proj.modules():
                if isinstance(_m, nn.LayerNorm):
                    if _m.weight is not None:
                        _m.weight.data = _m.weight.data.half()
                    if _m.bias is not None:
                        _m.bias.data = _m.bias.data.half()

            # IMPORTANT: `SparseTransformerBlock` uses `LayerNorm32` which internally
            # normalizes in fp32 (`x.float()`), so its parameters must stay fp32.
            for _m in self.joints_self_attn.modules():
                if isinstance(_m, nn.LayerNorm):
                    if _m.weight is not None:
                        _m.weight.data = _m.weight.data.float()
                    if _m.bias is not None:
                        _m.bias.data = _m.bias.data.float()

            self.skin_cross_groupped_net.apply(convert_module_to_f16)
            self.h_skin_coord_proj.apply(convert_module_to_f16)
            self.h_skin_coord_fuse.apply(convert_module_to_f16)

        # UNet is executed in fp32 (see `SparseSkinUNetNLevel.forward`), so keep its
        # weights in fp32 to avoid dtype mismatches inside spconv.
        if self.h_skin_unet is not None:
            self.h_skin_unet.apply(convert_module_to_f32)
        self.skin_decoder.convert_to_fp16()
|
| 566 |
+
|
| 567 |
+
    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.

        Mirrors :meth:`convert_to_fp16`: all modules (including both LayerNorm
        groups, which diverge in the fp16 path) are returned to fp32, and the
        ``h_skin_unet`` is kept fp32 as it always runs in fp32.
        """
        super().convert_to_fp32()
        self.upsample.apply(convert_module_to_f32)
        self.upsample_skin_net.apply(convert_module_to_f32)
        if self.upsample_skl:
            self.upsample_skl_net.apply(convert_module_to_f32)
        if self.skin_cross_from_groupped:
            self.root_parent_feat.data = self.root_parent_feat.data.float()
            self.joints_in_proj.apply(convert_module_to_f32)
            self.joints_self_attn.apply(convert_module_to_f32)

            # `convert_module_to_f32` may skip LayerNorm params; cast them explicitly.
            for _m in self.joints_in_proj.modules():
                if isinstance(_m, nn.LayerNorm):
                    if _m.weight is not None:
                        _m.weight.data = _m.weight.data.float()
                    if _m.bias is not None:
                        _m.bias.data = _m.bias.data.float()
            for _m in self.joints_self_attn.modules():
                if isinstance(_m, nn.LayerNorm):
                    if _m.weight is not None:
                        _m.weight.data = _m.weight.data.float()
                    if _m.bias is not None:
                        _m.bias.data = _m.bias.data.float()

            self.skin_cross_groupped_net.apply(convert_module_to_f32)
            self.h_skin_coord_proj.apply(convert_module_to_f32)
            self.h_skin_coord_fuse.apply(convert_module_to_f32)
        if self.h_skin_unet is not None:
            self.h_skin_unet.apply(convert_module_to_f32)
        self.skin_decoder.convert_to_fp32()
|
| 600 |
+
|
| 601 |
+
def to_representation(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
|
| 602 |
+
"""
|
| 603 |
+
Convert a batch of network outputs to 3D representations.
|
| 604 |
+
|
| 605 |
+
Args:
|
| 606 |
+
x: The [N x * x C] sparse tensor output by the network.
|
| 607 |
+
|
| 608 |
+
Returns:
|
| 609 |
+
list of representations
|
| 610 |
+
"""
|
| 611 |
+
ret = []
|
| 612 |
+
for i in range(x.shape[0]):
|
| 613 |
+
mesh = self.mesh_extractor(x[i], training=self.training)
|
| 614 |
+
ret.append(mesh)
|
| 615 |
+
return ret
|
| 616 |
+
|
| 617 |
+
def to_representation_skl(self, x: sp.SparseTensor) -> List[MeshExtractResult]:
|
| 618 |
+
"""
|
| 619 |
+
Convert a batch of network outputs to skeleton representations.
|
| 620 |
+
|
| 621 |
+
Args:
|
| 622 |
+
x: The [N x * x C] sparse tensor output by the network.
|
| 623 |
+
|
| 624 |
+
Returns:
|
| 625 |
+
list of skeleton representations
|
| 626 |
+
"""
|
| 627 |
+
ret = []
|
| 628 |
+
for i in range(x.shape[0]):
|
| 629 |
+
skl = self.skeleton_extractor(x[i], training=self.training)
|
| 630 |
+
ret.append(skl)
|
| 631 |
+
return ret
|
| 632 |
+
|
| 633 |
+
    def forward(self, x: sp.SparseTensor, x_skl: sp.SparseTensor, gt_joints=None, gt_parents=None) -> Tuple[List[MeshExtractResult], List[Any]]:
        """
        Decode structured latents into meshes plus skeletons with skinning.

        Args:
            x: sparse latent whose feature dim packs [geometry latent | skin features]
               (split at ``self.latent_channels``).
            x_skl: sparse skeleton latent.
            gt_joints: optional per-batch ground-truth joints; when given, used by
                ``skeleton_grouping`` instead of predicted grouping.
            gt_parents: optional per-batch ground-truth parents paired with ``gt_joints``.

        Returns:
            ``(meshes, skeletons)`` — lists with one entry per batch element.
            Skin predictions are attached to the skeleton reps by
            ``skinweight_forward`` as a side effect.
        """
        x0 = x
        # Split the packed latent: first latent_channels are geometry, rest are skin.
        x_skin = sp.SparseTensor(feats=x0.feats[:, self.latent_channels:], coords=x0.coords.clone())
        x = x0.replace(x0.feats[:, :self.latent_channels])
        if self.normalize_z:
            # L2-normalize skin and skeleton latents along the channel dim.
            x_skin = x_skin.replace(F.normalize(x_skin.feats, dim=-1))
            x_skl = x_skl.replace(F.normalize(x_skl.feats, dim=-1))

        # Backbone forward
        h, h_skl, h_skin = super().forward(x, x_skl, x_skin)

        # Optional smoothing on h_skin.
        if self.h_skin_unet is not None:
            h_skin = self.h_skin_unet(h_skin)

        # Skeleton prediction
        if self.upsample_skl:
            for block_skl in self.upsample_skl_net:
                h_skl = block_skl(h_skl)
        h_skl_middle = h_skl.type(x_skl.dtype)
        h_skl = self.out_layer_skl(h_skl_middle)
        h_skl_skin = self.out_layer_skl_skin(h_skl_middle)
        # Concatenate skeleton features with skeleton-skin features before extraction.
        h_skl = h_skl.replace(torch.cat([h_skl.feats, h_skl_skin.feats], dim=-1))
        skeletons = self.to_representation_skl(h_skl)
        skin_feats_joints_list = self.skeleton_grouping(skeletons, gt_joints=gt_joints, gt_parents=gt_parents)

        # Skin cross with grouped joint features
        if self.skin_cross_from_groupped:
            # Voxel centers mapped to [-1, 1], then Fourier-PE'd, projected, and fused.
            coords_xyz = h_skin.coords[:, 1:].to(device=h_skin.device, dtype=torch.float32)
            coords_norm = (coords_xyz + 0.5) / self.resolution * 2.0 - 1.0
            coords_pe = self.h_skin_coord_embedder(coords_norm)
            coords_pe = coords_pe.to(device=h_skin.device, dtype=h_skin.feats.dtype)
            coords_pe = self.h_skin_coord_proj(coords_pe)
            h_skin = h_skin.replace(torch.cat([h_skin.feats, coords_pe], dim=-1))
            h_skin = self.h_skin_coord_fuse(h_skin)
            joints_ctx = self._build_processed_joints_context(
                skeletons,
                skin_feats_joints_list,
                device=h_skin.device,
                dtype=h_skin.feats.dtype,
            )
            h_skin = self.skin_cross_groupped_net(h_skin, [joints_ctx])
        for block in self.upsample_skin_net:
            h_skin = block(h_skin)
        h_skin = h_skin.type(x.dtype)
        h_skin = self.out_layer_skin(h_skin)

        # Mesh prediction
        for block in self.upsample:
            h = block(h)
        h_middle = h.type(x.dtype)
        h = self.out_layer(h_middle)
        # Append per-voxel skin features so the mesh extractor can interpolate them.
        h = h.replace(torch.cat([h.feats, h_skin.feats], dim=-1))
        meshes = self.to_representation(h)

        # Attaches skin predictions onto `skeletons` in place.
        self.skinweight_forward(meshes, skeletons, gt_joints=gt_joints, gt_parents=gt_parents)
        return meshes, skeletons
|
| 690 |
+
|
| 691 |
+
def _joints_feats_list_to_sparse(
|
| 692 |
+
self,
|
| 693 |
+
joints_feats_list: List[torch.Tensor],
|
| 694 |
+
device: Optional[torch.device] = None,
|
| 695 |
+
dtype: Optional[torch.dtype] = None,
|
| 696 |
+
) -> sp.SparseTensor:
|
| 697 |
+
if device is None:
|
| 698 |
+
device = self.device
|
| 699 |
+
if dtype is None:
|
| 700 |
+
dtype = self.dtype
|
| 701 |
+
feats_per_batch: List[torch.Tensor] = []
|
| 702 |
+
for joints_feats in joints_feats_list:
|
| 703 |
+
joints_feats = joints_feats.to(device=device, dtype=dtype)
|
| 704 |
+
feats_per_batch.append(joints_feats)
|
| 705 |
+
feats = torch.cat(feats_per_batch, dim=0)
|
| 706 |
+
# Coords are [batch, x, y, z]. We encode token index into x and keep y/z = 0.
|
| 707 |
+
batch_indices: List[torch.Tensor] = []
|
| 708 |
+
x_indices: List[torch.Tensor] = []
|
| 709 |
+
for bi, joints_feats in enumerate(feats_per_batch):
|
| 710 |
+
ji = int(joints_feats.shape[0])
|
| 711 |
+
batch_indices.append(torch.full((ji,), bi, device=device, dtype=torch.int32))
|
| 712 |
+
x_indices.append(torch.arange(ji, device=device, dtype=torch.int32))
|
| 713 |
+
b = torch.cat(batch_indices, dim=0)
|
| 714 |
+
x = torch.cat(x_indices, dim=0)
|
| 715 |
+
yz = torch.zeros((x.shape[0], 2), device=device, dtype=torch.int32)
|
| 716 |
+
coords = torch.cat([b[:, None], x[:, None], yz], dim=1)
|
| 717 |
+
return sp.SparseTensor(feats=feats, coords=coords)
|
| 718 |
+
|
| 719 |
+
    def _build_processed_joints_context(
        self,
        skeletons: List[Any],
        skin_feats_joints_list: List[torch.Tensor],
        device: torch.device,
        dtype: torch.dtype,
    ) -> sp.SparseTensor:
        """
        Build the joint-token context for the grouped-joint cross-attention.

        Per batch element: embed grouped joint positions, gather each joint's
        parent skin feature (root joints use a trainable parameter), project the
        concatenation, run joint self-attention, then append the raw joint skin
        features as a skip connection.

        Raises:
            ValueError: if any skeleton lacks grouped joints/parents.
        """
        processed: List[torch.Tensor] = []
        raw_skin: List[torch.Tensor] = []
        for rep_skl, skin_feats_joints in zip(skeletons, skin_feats_joints_list):
            joints = rep_skl.joints_grouped
            parents = rep_skl.parents_grouped
            if joints is None or parents is None:
                raise ValueError('Expected grouped joints/parents for skin_cross_from_groupped.')
            joints = joints.to(device=device, dtype=dtype)
            parents = parents.to(device=device)
            skin_feats_joints = skin_feats_joints.to(device=device, dtype=dtype)
            raw_skin.append(skin_feats_joints)

            # Positional embedding of grouped joint locations.
            pe = self.joints_pos_embedder(joints).to(device=device, dtype=dtype)

            # Parent skin features (root uses trainable parameter)
            parent_idx = parents.to(torch.long)
            # parent index < 0 marks a root joint.
            valid = parent_idx >= 0
            root_feat = self.root_parent_feat.to(device=device, dtype=dtype)
            parent_feat_root = root_feat.unsqueeze(0).expand(skin_feats_joints.shape[0], -1)
            # clamp(min=0) keeps the gather index valid for roots; torch.where
            # then substitutes the trainable root feature for those rows.
            parent_feat_gather = skin_feats_joints[parent_idx.clamp(min=0)]
            parent_feat = torch.where(valid.unsqueeze(1), parent_feat_gather, parent_feat_root)

            joint_in = torch.cat([skin_feats_joints, pe, parent_feat], dim=-1)
            joint_h = self.joints_in_proj(joint_in)
            processed.append(joint_h)

        joints_ctx = self._joints_feats_list_to_sparse(processed, device=device, dtype=dtype)
        for blk in self.joints_self_attn:
            joints_ctx = blk(joints_ctx)
        # Skip connection: concatenate original joint skin feats after self-attn.
        joints_skip = self._joints_feats_list_to_sparse(raw_skin, device=device, dtype=dtype)
        joints_ctx = joints_ctx.replace(torch.cat([joints_ctx.feats, joints_skip.feats], dim=-1))
        return joints_ctx
|
| 759 |
+
|
| 760 |
+
def skeleton_grouping(self, reps_skl, gt_joints=None, gt_parents=None, skin_feats_skl_list=None, return_skin_pred_only=False):
|
| 761 |
+
skin_feats_joints_list = []
|
| 762 |
+
for i, rep_skl in zip(range(len(reps_skl)), reps_skl):
|
| 763 |
+
if gt_joints is not None:
|
| 764 |
+
joints_grouped = gt_joints[i]
|
| 765 |
+
parents_grouped = gt_parents[i]
|
| 766 |
+
elif rep_skl.joints_grouped is None:
|
| 767 |
+
with torch.no_grad():
|
| 768 |
+
joints_grouped, parents_grouped = self.grouping_func(joints=rep_skl.joints, parents=rep_skl.parents, joints_conf=rep_skl.conf_j, parents_conf=rep_skl.conf_p)
|
| 769 |
+
else:
|
| 770 |
+
joints_grouped = rep_skl.joints_grouped
|
| 771 |
+
parents_grouped = rep_skl.parents_grouped
|
| 772 |
+
|
| 773 |
+
if not return_skin_pred_only:
|
| 774 |
+
rep_skl.joints_grouped = joints_grouped
|
| 775 |
+
rep_skl.parents_grouped = parents_grouped
|
| 776 |
+
|
| 777 |
+
# Calculate NN indices for joints
|
| 778 |
+
positions_skl = rep_skl.positions
|
| 779 |
+
_, joints_nn_idx, _ = knn_points(positions_skl[None], joints_grouped[None].detach(), K=1, norm=2, return_nn=False)
|
| 780 |
+
joints_nn_idx = joints_nn_idx[0, :, 0]
|
| 781 |
+
skin_feats_skl = rep_skl.skin_feats if skin_feats_skl_list is None else skin_feats_skl_list[i]
|
| 782 |
+
|
| 783 |
+
# Average the predicted joint features
|
| 784 |
+
conf_skin = torch.sigmoid(rep_skl.conf_skin) if rep_skl.conf_skin is not None else torch.ones_like(skin_feats_skl[:, :1])
|
| 785 |
+
|
| 786 |
+
skin_feats_joints = torch.zeros([joints_grouped.shape[0], skin_feats_skl.shape[-1]], device=self.device, dtype=skin_feats_skl.dtype)
|
| 787 |
+
skin_feats_square_joints = skin_feats_joints.clone()
|
| 788 |
+
skin_conf_joints = torch.zeros([joints_grouped.shape[0], 1], device=self.device, dtype=skin_feats_skl.dtype)
|
| 789 |
+
|
| 790 |
+
skin_feats_joints.scatter_add_(0, joints_nn_idx[:, None].expand(-1, skin_feats_skl.shape[-1]), skin_feats_skl * conf_skin)
|
| 791 |
+
skin_feats_square_joints.scatter_add_(0, joints_nn_idx[:, None].expand(-1, skin_feats_skl.shape[-1]), skin_feats_skl.square() * conf_skin)
|
| 792 |
+
skin_conf_joints.scatter_add_(0, joints_nn_idx[:, None], conf_skin)
|
| 793 |
+
|
| 794 |
+
skin_feats_joints = skin_feats_joints / skin_conf_joints.clamp(min=1e-6)
|
| 795 |
+
skin_feats_square_joints = skin_feats_square_joints / skin_conf_joints.clamp(min=1e-6)
|
| 796 |
+
skin_feats_joints_var = skin_feats_square_joints - skin_feats_joints.square()
|
| 797 |
+
skin_feats_joints_var_loss = skin_feats_joints_var.mean()
|
| 798 |
+
|
| 799 |
+
if not return_skin_pred_only:
|
| 800 |
+
rep_skl.skin_feats_joints_var_loss = skin_feats_joints_var_loss
|
| 801 |
+
rep_skl.skin_feats_joints = skin_feats_joints
|
| 802 |
+
skin_feats_joints_list.append(skin_feats_joints)
|
| 803 |
+
return skin_feats_joints_list
|
| 804 |
+
|
| 805 |
+
def skinweight_forward(self, reps, reps_skl, gt_joints=None, gt_parents=None, return_skin_pred_only=False, skin_feats_verts_list=None, skin_feats_skl_list=None, *args, **kwargs):
|
| 806 |
+
if return_skin_pred_only:
|
| 807 |
+
skin_preds = []
|
| 808 |
+
if reps_skl[0].parents_grouped is None or return_skin_pred_only:
|
| 809 |
+
skin_feats_joints_list = self.skeleton_grouping(reps_skl, gt_joints=gt_joints, gt_parents=gt_parents, skin_feats_skl_list=skin_feats_skl_list, return_skin_pred_only=return_skin_pred_only)
|
| 810 |
+
else:
|
| 811 |
+
skin_feats_joints_list = [rep_skl.skin_feats_joints for rep_skl in reps_skl]
|
| 812 |
+
for i, rep, rep_skl in zip(range(len(reps)), reps, reps_skl):
|
| 813 |
+
# Joint skinning features
|
| 814 |
+
skin_feats_joints = skin_feats_joints_list[i]
|
| 815 |
+
# Vertex skinning features
|
| 816 |
+
skin_feats_verts = rep.vertex_skin_feats if skin_feats_verts_list is None else skin_feats_verts_list[i]
|
| 817 |
+
# Predict skin weights
|
| 818 |
+
parents_grouped = rep_skl.parents_grouped
|
| 819 |
+
skin_pred = self.skin_decoder(skin_feats_verts[None], skin_feats_joints[None], parents_grouped[None])
|
| 820 |
+
skin_pred = skin_pred[0]
|
| 821 |
+
if return_skin_pred_only:
|
| 822 |
+
skin_preds.append(skin_pred)
|
| 823 |
+
else:
|
| 824 |
+
reps_skl[i].skin_pred = skin_pred
|
| 825 |
+
if return_skin_pred_only:
|
| 826 |
+
return skin_preds
|
| 827 |
+
|
| 828 |
+
|
| 829 |
+
class AniGenElasticSLatMeshDecoder(SparseTransformerElasticMixin, AniGenSLatMeshDecoder):
    """
    Slat VAE Mesh decoder with elastic memory management.
    Used for training with low VRAM.

    The mixin is listed first so its overrides take precedence over the base
    decoder in the MRO; no behavior is added here beyond that composition.
    """
    pass
|
anigen/models/structured_latent_vae/anigen_encoder.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from ...modules import sparse as sp
|
| 6 |
+
from ..sparse_elastic_mixin import SparseTransformerElasticMixin
|
| 7 |
+
from .anigen_base import AniGenSparseTransformerBase, FreqPositionalEmbedder
|
| 8 |
+
from pytorch3d.ops import knn_points
|
| 9 |
+
from .skin_models import SkinEncoder
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def block_attn_config(self):
    """
    Return the attention configuration of the model.

    Yields one 5-tuple per transformer block:
    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``.
    Alternating-shift schemes vary with the block index parity (or, for
    "shift_order", cycle through the four serialize modes).
    """
    mode = self.attn_mode
    for block_idx in range(self.num_blocks):
        parity = block_idx % 2
        if mode == "shift_window":
            yield "serialized", self.window_size, 0, (16 * parity,) * 3, sp.SerializeMode.Z_ORDER
        elif mode == "shift_sequence":
            yield "serialized", self.window_size, self.window_size // 2 * parity, (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif mode == "shift_order":
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[block_idx % 4]
        elif mode == "full":
            yield "full", None, None, None, None
        elif mode == "swin":
            yield "windowed", self.window_size, None, self.window_size // 2 * parity, None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class FeedForwardNet(nn.Module):
    """
    Simple two-layer MLP: Linear -> tanh-approximated GELU -> Linear.

    The hidden width is ``int(channels * mlp_ratio)``; the output width
    defaults to the input width when ``channels_out`` is not given.
    """

    def __init__(self, channels: int, channels_out: int=None, mlp_ratio: float = 4.0):
        super().__init__()
        if channels_out is None:
            channels_out = channels
        hidden_channels = int(channels * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(channels, hidden_channels),
            nn.GELU(approximate="tanh"),
            nn.Linear(hidden_channels, channels_out),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the MLP to the last dimension of ``x``."""
        return self.mlp(x)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class AniGenSLatEncoder(AniGenSparseTransformerBase):
|
| 44 |
+
def __init__(
|
| 45 |
+
self,
|
| 46 |
+
resolution: int,
|
| 47 |
+
in_channels: int,
|
| 48 |
+
|
| 49 |
+
model_channels: int,
|
| 50 |
+
model_channels_skl: int,
|
| 51 |
+
model_channels_skin: int,
|
| 52 |
+
|
| 53 |
+
latent_channels: int,
|
| 54 |
+
latent_channels_skl: int,
|
| 55 |
+
latent_channels_vertskin: int,
|
| 56 |
+
|
| 57 |
+
num_blocks: int,
|
| 58 |
+
num_heads: Optional[int] = None,
|
| 59 |
+
num_head_channels: Optional[int] = 64,
|
| 60 |
+
|
| 61 |
+
num_heads_skl: int = 32,
|
| 62 |
+
num_heads_skin: int = 32,
|
| 63 |
+
|
| 64 |
+
skl_pos_embed_freq: int = 10,
|
| 65 |
+
skin_encoder_config: Optional[Dict[str, Any]] = {},
|
| 66 |
+
encode_upsampled_skin_feat: bool = True,
|
| 67 |
+
skin_ae_name: Optional[str] = "SkinAE",
|
| 68 |
+
|
| 69 |
+
mlp_ratio: float = 4,
|
| 70 |
+
attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",
|
| 71 |
+
attn_mode_cross: Literal["full", "serialized", "windowed"] = "full",
|
| 72 |
+
window_size: int = 8,
|
| 73 |
+
pe_mode: Literal["ape", "rope"] = "ape",
|
| 74 |
+
use_fp16: bool = False,
|
| 75 |
+
use_checkpoint: bool = False,
|
| 76 |
+
qk_rms_norm: bool = False,
|
| 77 |
+
|
| 78 |
+
use_pretrain_branch: bool = True,
|
| 79 |
+
freeze_pretrain_branch: bool = True,
|
| 80 |
+
modules_to_freeze: Optional[List[str]] = ["input_layer", "blocks", "out_layer", "skin_encoder"],
|
| 81 |
+
|
| 82 |
+
skin_cross_from_geo: bool = True,
|
| 83 |
+
skl_cross_from_geo: bool = True,
|
| 84 |
+
skin_skl_cross: bool = True,
|
| 85 |
+
|
| 86 |
+
latent_denoising: bool = True,
|
| 87 |
+
normalize_z: bool = True,
|
| 88 |
+
normalize_scale: float = 1.0,
|
| 89 |
+
|
| 90 |
+
jp_residual_fields: bool = False,
|
| 91 |
+
jp_hyper_continuous: bool = False,
|
| 92 |
+
):
|
| 93 |
+
self.use_pretrain_branch = use_pretrain_branch
|
| 94 |
+
self.freeze_pretrain_branch = freeze_pretrain_branch
|
| 95 |
+
self.skl_pos_embed_freq = skl_pos_embed_freq
|
| 96 |
+
self.latent_denoising = latent_denoising
|
| 97 |
+
self.normalize_latent = normalize_z and latent_denoising
|
| 98 |
+
self.normalize_scale = normalize_scale
|
| 99 |
+
self.jp_residual_fields = jp_residual_fields
|
| 100 |
+
self.jp_hyper_continuous = jp_hyper_continuous
|
| 101 |
+
|
| 102 |
+
super().__init__(
|
| 103 |
+
in_channels=in_channels,
|
| 104 |
+
in_channels_skl=model_channels_skl,
|
| 105 |
+
in_channels_skin=model_channels_skin,
|
| 106 |
+
model_channels=model_channels,
|
| 107 |
+
model_channels_skl=model_channels_skl,
|
| 108 |
+
model_channels_skin=model_channels_skin,
|
| 109 |
+
num_blocks=num_blocks,
|
| 110 |
+
num_heads=num_heads,
|
| 111 |
+
num_heads_skl=num_heads_skl,
|
| 112 |
+
num_heads_skin=num_heads_skin,
|
| 113 |
+
num_head_channels=num_head_channels,
|
| 114 |
+
mlp_ratio=mlp_ratio,
|
| 115 |
+
attn_mode=attn_mode,
|
| 116 |
+
attn_mode_cross=attn_mode_cross,
|
| 117 |
+
window_size=window_size,
|
| 118 |
+
pe_mode=pe_mode,
|
| 119 |
+
use_fp16=use_fp16,
|
| 120 |
+
use_checkpoint=use_checkpoint,
|
| 121 |
+
qk_rms_norm=qk_rms_norm,
|
| 122 |
+
skin_cross_from_geo=skin_cross_from_geo,
|
| 123 |
+
skl_cross_from_geo=skl_cross_from_geo,
|
| 124 |
+
skin_skl_cross=skin_skl_cross,
|
| 125 |
+
)
|
| 126 |
+
self.pretrain_class_name = ["AniGenElasticSLatEncoder", skin_ae_name]
|
| 127 |
+
self.pretrain_ckpt_filter_prefix = {skin_ae_name: "skin_encoder"}
|
| 128 |
+
self.resolution = resolution
|
| 129 |
+
|
| 130 |
+
self.latent_channels = latent_channels
|
| 131 |
+
self.latent_channels_skl = latent_channels_skl
|
| 132 |
+
self.latent_channels_vertskin = latent_channels_vertskin
|
| 133 |
+
|
| 134 |
+
skin_encoder_config['use_fp16'] = use_fp16
|
| 135 |
+
self.skin_encoder = SkinEncoder(**skin_encoder_config)
|
| 136 |
+
self.encode_upsampled_skin_feat = encode_upsampled_skin_feat
|
| 137 |
+
self.in_layer_skin = FeedForwardNet(channels=self.skin_encoder.skin_feat_channels * (8 if encode_upsampled_skin_feat else 1), channels_out=model_channels_skin)
|
| 138 |
+
|
| 139 |
+
self.pos_embedder_fourier = FreqPositionalEmbedder(in_dim=4 if self.jp_hyper_continuous else 3, max_freq_log2=self.skl_pos_embed_freq, num_freqs=self.skl_pos_embed_freq, include_input=True)
|
| 140 |
+
self.root_embedding = nn.Parameter(torch.zeros(1, self.pos_embedder_fourier.out_dim))
|
| 141 |
+
|
| 142 |
+
# Channel Balance
|
| 143 |
+
self.in_layer_jp_skl = FeedForwardNet(channels=2 * self.pos_embedder_fourier.out_dim, channels_out=model_channels_skl//4)
|
| 144 |
+
self.in_layer_skin_skl = FeedForwardNet(channels=self.skin_encoder.skin_feat_channels, channels_out=model_channels_skl-(model_channels_skl//4))
|
| 145 |
+
|
| 146 |
+
self.out_layer = sp.SparseLinear(model_channels, 2 * latent_channels)
|
| 147 |
+
if self.latent_denoising:
|
| 148 |
+
self.out_layer_skl = sp.SparseLinear(model_channels_skl, latent_channels_skl)
|
| 149 |
+
self.out_layer_vertskin = sp.SparseLinear(model_channels_skin, latent_channels_vertskin)
|
| 150 |
+
else:
|
| 151 |
+
self.out_layer_skl = sp.SparseLinear(model_channels_skl, 2 * latent_channels_skl)
|
| 152 |
+
self.out_layer_vertskin = sp.SparseLinear(model_channels_skin, 2 * latent_channels_vertskin)
|
| 153 |
+
|
| 154 |
+
self.initialize_weights()
|
| 155 |
+
if use_fp16:
|
| 156 |
+
self.convert_to_fp16()
|
| 157 |
+
else:
|
| 158 |
+
self.convert_to_fp32()
|
| 159 |
+
|
| 160 |
+
if 'all' in modules_to_freeze:
|
| 161 |
+
modules_to_freeze = list(set([k.split('.')[0] for k in self.state_dict().keys()]))
|
| 162 |
+
print(f"\033[93mFreezing all modules: {modules_to_freeze}\033[0m")
|
| 163 |
+
if self.use_pretrain_branch and self.freeze_pretrain_branch:
|
| 164 |
+
for module in modules_to_freeze:
|
| 165 |
+
if hasattr(self, module):
|
| 166 |
+
mod = getattr(self, module)
|
| 167 |
+
if isinstance(mod, nn.ModuleList):
|
| 168 |
+
for m in mod:
|
| 169 |
+
for name, param in m.named_parameters():
|
| 170 |
+
if 'lora' not in name:
|
| 171 |
+
param.requires_grad = False
|
| 172 |
+
elif isinstance(mod, nn.Module):
|
| 173 |
+
for name, param in mod.named_parameters():
|
| 174 |
+
if 'lora' not in name:
|
| 175 |
+
param.requires_grad = False
|
| 176 |
+
elif isinstance(mod, torch.Tensor):
|
| 177 |
+
if mod.requires_grad:
|
| 178 |
+
mod.requires_grad = False
|
| 179 |
+
|
| 180 |
+
def initialize_weights(self) -> None:
|
| 181 |
+
super().initialize_weights()
|
| 182 |
+
# Zero-out output layers:
|
| 183 |
+
nn.init.constant_(self.out_layer.weight, 0)
|
| 184 |
+
nn.init.constant_(self.out_layer.bias, 0)
|
| 185 |
+
|
| 186 |
+
    def skeleton_embedding(self, x, x_skl, joints_list, parents_list, skin_list, gt_meshes, bvh_list=None):
        """
        Build per-voxel input features for the shape branch and the skeleton
        branch from skeleton joints, parent hierarchy, and skinning weights.

        Args:
            x: batched sparse tensor of shape voxels (one sample per batch index).
            x_skl: batched sparse tensor of skeleton-branch voxels.
            joints_list: per-sample joint positions.
            parents_list: per-sample parent indices (-1 marks the root).
            skin_list: per-sample per-vertex skinning weight matrices.
            gt_meshes: per-sample dicts with at least 'vertices' and 'faces'.
            bvh_list: optional per-sample BVH accelerators for surface queries;
                when absent, nearest-vertex lookup is used instead.

        Returns:
            (x_new, x_skl_new, joint_skin_embeds, vert_skin_embeds, joints_pos_list):
            rebuilt sparse tensors with embedded features, the raw skin-encoder
            outputs, and the per-sample nearest-joint position tensors.
        """
        res = self.resolution
        feats_new = []
        feats_skl_new = []
        coords_new = []
        coords_skl_new = []

        joint_skin_embeds, vert_skin_embeds = self.skin_encoder(joints_list, parents_list, skin_list)
        joints_pos_list = []

        for i in range(len(joints_list)):
            parent_idx = parents_list[i].clone()

            # NOTE(review): the batch-index write below mutates x[i].coords /
            # x_skl[i].coords in place (no clone, unlike parent_idx above) —
            # confirm the input sparse tensors are not reused elsewhere.
            coords_new.append(x[i].coords)
            coords_skl_new.append(x_skl[i].coords)
            coords_new[-1][:, 0] = i
            coords_skl_new[-1][:, 0] = i

            # Voxel centers in normalized [-0.5, 0.5) coordinates.
            v_pos = (x[i].coords[:, 1:4] + 0.5) / res - 0.5
            v_pos_skl = (x_skl[i].coords[:, 1:4] + 0.5) / res - 0.5
            # Two nearest joints per skeleton voxel. NOTE(review): knn_points
            # distances are presumably squared (pytorch3d convention) — confirm.
            dist_nn_12, joints_nn_idx, _ = knn_points(v_pos_skl[None], joints_list[i][None], K=2, norm=2, return_nn=False)
            joints_nn_idx = joints_nn_idx[0, :, 0]

            # Skeleton positional embedding
            # Nearest-joint and its-parent positions, optionally relative to the voxel.
            joints_pos = joints_list[i][joints_nn_idx] - (v_pos_skl if self.jp_residual_fields else 0)
            parents_pos = joints_list[i][parent_idx[joints_nn_idx]] - (v_pos_skl if self.jp_residual_fields else 0)
            if self.jp_hyper_continuous:
                # Extra channel: how much closer the 1st-nearest joint is than
                # the 2nd (1 when much closer, 0 when equidistant).
                factor = (1 - (dist_nn_12[0, :, 0:1] / (dist_nn_12[0, :, 1:2] + 1e-8)).clamp(max=1.0))
                joints_pos = torch.cat([joints_pos, factor], dim=-1)
                parents_pos = torch.cat([parents_pos, factor], dim=-1)
            joints_pos_embed = self.pos_embedder_fourier(joints_pos)
            parents_pos_embed = self.pos_embedder_fourier(parents_pos)
            # Root joints (parent == -1) use a learned embedding instead.
            parents_pos_embed = torch.where(parent_idx[joints_nn_idx][:, None] == -1, self.root_embedding.expand_as(parents_pos_embed), parents_pos_embed)
            jp_pos_embed_nn = torch.cat([joints_pos_embed, parents_pos_embed], dim=-1)
            jp_pos_embed_nn = self.in_layer_jp_skl(jp_pos_embed_nn)

            # Skeleton skin embedding
            j_skin_embed_nn = joint_skin_embeds[i][joints_nn_idx]
            j_skin_embed_nn = self.in_layer_skin_skl(j_skin_embed_nn)

            # Concatenate
            jp_skl_embed = torch.cat([jp_pos_embed_nn, j_skin_embed_nn], dim=-1)
            feats_skl_new.append(jp_skl_embed)

            if self.encode_upsampled_skin_feat:
                # Create 8 sub-voxel points
                offsets = torch.tensor([
                    [-1, -1, -1], [-1, -1, 1], [-1, 1, -1], [-1, 1, 1],
                    [1, -1, -1], [1, -1, 1], [1, 1, -1], [1, 1, 1]
                ], device=v_pos.device, dtype=v_pos.dtype) * (0.25 / res)
                query_pos = v_pos.unsqueeze(1) + offsets.unsqueeze(0)  # (N, 8, 3)
                query_pos_flat = query_pos.view(-1, 3)
            else:
                query_pos_flat = v_pos

            if bvh_list is not None:
                # Project each query point onto the mesh surface and blend the
                # per-vertex skin embeddings with barycentric weights.
                bvh = bvh_list[i].to(v_pos.device)
                _, face_id, uvw = bvh.unsigned_distance(query_pos_flat, return_uvw=True)
                uvw = uvw.clamp(min=0.0)
                uvw_sum = uvw.sum(dim=-1, keepdim=True).clamp_min(1e-6)
                uvw = uvw / uvw_sum
                face_id = gt_meshes[i]['faces'][face_id]
                voxel_skin_embeds = (vert_skin_embeds[i][face_id] * uvw[..., None]).sum(1)
            else:
                # Fallback: nearest mesh vertex lookup.
                gt_mesh_verts = gt_meshes[i]['vertices']
                _, mesh_nn_idx, _ = knn_points(query_pos_flat[None], gt_mesh_verts[None], K=1, norm=2, return_nn=False)
                mesh_nn_idx = mesh_nn_idx[0, :, 0]
                voxel_skin_embeds = vert_skin_embeds[i][mesh_nn_idx]

            # Flatten the 8 sub-voxel embeddings (or keep the single one) per voxel.
            voxel_skin_embeds = voxel_skin_embeds.view(v_pos.shape[0], -1)
            voxel_skin_embeds = self.in_layer_skin(voxel_skin_embeds)
            feats_new.append(voxel_skin_embeds)
            joints_pos_list.append(joints_pos)

        x_new = sp.SparseTensor(coords=torch.cat(coords_new, dim=0), feats=torch.cat(feats_new, dim=0))
        x_skl_new = sp.SparseTensor(coords=torch.cat(coords_skl_new, dim=0), feats=torch.cat(feats_skl_new, dim=0))

        return x_new, x_skl_new, joint_skin_embeds, vert_skin_embeds, joints_pos_list
|
| 264 |
+
|
| 265 |
+
def encode_sample(self, x: sp.SparseTensor, out_layer: sp.SparseLinear, sample_posterior: bool = True, latent_denoising: bool = False):
|
| 266 |
+
x = x.type(torch.float32)
|
| 267 |
+
x = x.replace(F.layer_norm(x.feats, x.feats.shape[-1:]))
|
| 268 |
+
x = out_layer(x)
|
| 269 |
+
if latent_denoising:
|
| 270 |
+
if self.normalize_latent:
|
| 271 |
+
x = x.replace(nn.functional.normalize(x.feats, dim=-1) * self.normalize_scale)
|
| 272 |
+
mean, logvar = x.feats, torch.zeros_like(x.feats)
|
| 273 |
+
else:
|
| 274 |
+
mean, logvar = x.feats.chunk(2, dim=-1)
|
| 275 |
+
if sample_posterior and not latent_denoising:
|
| 276 |
+
std = torch.exp(0.5 * logvar)
|
| 277 |
+
z = mean + std * torch.randn_like(std)
|
| 278 |
+
else:
|
| 279 |
+
z = mean
|
| 280 |
+
z = x.replace(z)
|
| 281 |
+
if latent_denoising:
|
| 282 |
+
mean = mean.detach()
|
| 283 |
+
return z, mean, logvar
|
| 284 |
+
|
| 285 |
+
def forward(self, x: sp.SparseTensor, x_skl: sp.SparseTensor, sample_posterior=True, return_raw=False, return_skin_encoded=False, **kwargs):
|
| 286 |
+
x_skin, x_skl, joint_skin_embeds, vert_skin_embeds, joints_pos = self.skeleton_embedding(x, x_skl, kwargs.get('gt_joints'), kwargs.get('gt_parents'), kwargs.get('gt_skin'), kwargs.get('gt_mesh'), kwargs.get('bvh_list', None))
|
| 287 |
+
h, h_skl, h_skin = super().forward(x, x_skl, x_skin)
|
| 288 |
+
|
| 289 |
+
z, mean, logvar = self.encode_sample(h, self.out_layer, sample_posterior, latent_denoising=False)
|
| 290 |
+
z_skl, mean_skl, logvar_skl = self.encode_sample(h_skl, self.out_layer_skl, sample_posterior, latent_denoising=self.latent_denoising)
|
| 291 |
+
z_skin, mean_skin, logvar_skin = self.encode_sample(h_skin, self.out_layer_vertskin, sample_posterior, latent_denoising=self.latent_denoising)
|
| 292 |
+
|
| 293 |
+
z = z.replace(torch.cat([z.feats, z_skin.feats], dim=-1))
|
| 294 |
+
mean, logvar = torch.cat([mean, mean_skin], dim=-1), torch.cat([logvar, logvar_skin], dim=-1)
|
| 295 |
+
|
| 296 |
+
if not return_skin_encoded:
|
| 297 |
+
# Ordinary return without skin encoded features
|
| 298 |
+
if return_raw:
|
| 299 |
+
return z, mean, logvar, z_skl, mean_skl, logvar_skl, joint_skin_embeds, vert_skin_embeds, joints_pos
|
| 300 |
+
else:
|
| 301 |
+
return z, z_skl, joint_skin_embeds, vert_skin_embeds, joints_pos
|
| 302 |
+
else:
|
| 303 |
+
# Return skin encoded features as well for checking
|
| 304 |
+
if return_raw:
|
| 305 |
+
return z, mean, logvar, z_skl, mean_skl, logvar_skl, joint_skin_embeds, vert_skin_embeds, joints_pos, x_skin, x_skl
|
| 306 |
+
else:
|
| 307 |
+
return z, z_skl, joint_skin_embeds, vert_skin_embeds, joints_pos, x_skin, x_skl
|
| 308 |
+
|
| 309 |
+
def encode_skin(self, joints_list: List[torch.Tensor], parents_list: List[torch.Tensor], skin_list: List[torch.Tensor]=None):
|
| 310 |
+
joint_skin_embeds, vert_skin_embeds = self.skin_encoder(joints_list, parents_list, skin_list)
|
| 311 |
+
return joint_skin_embeds, vert_skin_embeds
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
class AniGenElasticSLatEncoder(SparseTransformerElasticMixin, AniGenSLatEncoder):
    """
    AniGen structured-latent VAE encoder combined with elastic memory
    management, intended for training under low-VRAM budgets.
    """
|
anigen/models/structured_latent_vae/base.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ...modules.utils import convert_module_to_f16, convert_module_to_f32
|
| 5 |
+
from ...modules import sparse as sp
|
| 6 |
+
from ...modules.transformer import AbsolutePositionEmbedder
|
| 7 |
+
from ...modules.sparse.transformer import SparseTransformerBlock
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def block_attn_config(self):
    """
    Yield the per-block attention configuration of the model.

    For each of ``self.num_blocks`` transformer blocks, yields a tuple
    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``
    matching the corresponding ``SparseTransformerBlock`` arguments.

    Raises:
        ValueError: if ``self.attn_mode`` is not a recognized mode.
    """
    for i in range(self.num_blocks):
        if self.attn_mode == "shift_window":
            # Odd blocks shift the serialization window by 16 in all 3 axes.
            yield "serialized", self.window_size, 0, (16 * (i % 2),) * 3, sp.SerializeMode.Z_ORDER
        elif self.attn_mode == "shift_sequence":
            # Odd blocks shift by half a window along the serialized sequence.
            yield "serialized", self.window_size, self.window_size // 2 * (i % 2), (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif self.attn_mode == "shift_order":
            # Cycle through the available serialization orders.
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
        elif self.attn_mode == "full":
            yield "full", None, None, None, None
        elif self.attn_mode == "swin":
            # Swin-style windowed attention with alternating half-window shifts.
            yield "windowed", self.window_size, None, self.window_size // 2 * (i % 2), None
        else:
            # Previously an unrecognized mode silently produced an empty
            # generator, which would build a model with zero blocks.
            raise ValueError(f"Unknown attention mode: {self.attn_mode}")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SparseTransformerBase(nn.Module):
    """
    Sparse Transformer without output layers.
    Serve as the base class for encoder and decoder.
    """
    def __init__(
        self,
        in_channels: int,
        model_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.num_blocks = num_blocks
        self.window_size = window_size
        # Derive head count from per-head channels when not given explicitly.
        # NOTE(review): assumes num_head_channels is not None whenever
        # num_heads is None — confirm callers never pass both as None.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.attn_mode = attn_mode
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.qk_rms_norm = qk_rms_norm
        self.dtype = torch.float16 if use_fp16 else torch.float32

        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)

        self.input_layer = sp.SparseLinear(in_channels, model_channels)
        # The comprehension's loop variables intentionally shadow the
        # constructor arguments: block_attn_config yields one configuration
        # tuple per block (e.g. alternating shifts for swin-style modes).
        self.blocks = nn.ModuleList([
            SparseTransformerBlock(
                model_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode=attn_mode,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
            )
            for attn_mode, window_size, shift_sequence, shift_window, serialize_mode in block_attn_config(self)
        ])

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Xavier-initialize every linear layer; zero its bias."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """Project input features, add APE positional embedding if enabled, and run all blocks."""
        h = self.input_layer(x)
        if self.pe_mode == "ape":
            # coords[:, 0] is the batch index; embed only the xyz part.
            h = h + self.pos_embedder(x.coords[:, 1:])
        # Blocks run in the torso dtype (fp16 when use_fp16).
        h = h.type(self.dtype)
        for block in self.blocks:
            h = block(h)
        return h
|
anigen/models/structured_latent_vae/skin_models.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from typing import *
|
| 4 |
+
from ..sparse_elastic_mixin import SparseTransformerElasticMixin
|
| 5 |
+
from ...modules.transformer import TransformerBlock, FeedForwardNet
|
| 6 |
+
from .anigen_base import FreqPositionalEmbedder, TransformerCrossBlock
|
| 7 |
+
from ...modules.utils import zero_module, convert_module_to_f16, convert_module_to_f32
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Embedder(nn.Module):
    """
    Feed-forward in/out embedder with a trunk of either full-attention
    transformer blocks (run in fp16) or residual feed-forward blocks.
    """
    def __init__(self, in_dim: int, out_dim: int, hidden_dim: int = None, depth: int = 4, mlp_ratio: float = 4.0, jp_embed_attn: bool = True):
        super().__init__()
        if hidden_dim is None:
            hidden_dim = out_dim
        self.jp_embed_attn = jp_embed_attn
        self.in_layer = FeedForwardNet(channels=in_dim, out_channels=hidden_dim, mlp_ratio=mlp_ratio)
        if jp_embed_attn:
            # Attention trunk; cast to fp16 up front (forward feeds fp16 inputs).
            attn_blocks = [TransformerBlock(hidden_dim, num_heads=8, attn_mode='full') for _ in range(depth)]
            for blk in attn_blocks:
                blk.to(torch.float16)
            self.blocks = nn.ModuleList(attn_blocks)
        else:
            self.blocks = nn.ModuleList([
                FeedForwardNet(channels=hidden_dim, out_channels=hidden_dim, mlp_ratio=mlp_ratio)
                for _ in range(depth)
            ])
        self.out_layer = nn.Linear(hidden_dim, out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.in_layer(x)
        h = x
        for blk in self.blocks:
            if self.jp_embed_attn:
                # Attention path: add a batch dim, run in fp16, drop the batch dim.
                h = blk(h[None].type(torch.float16))[0]
            else:
                # FFN path: residual back to the embedded input x (not to h).
                h = blk(h) + x
        # Cast back to the input dtype before the final projection.
        return self.out_layer(h.type(x.dtype))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class SkinEncoder(nn.Module):
    """
    Encode a skeleton (joint positions + parent hierarchy) into per-joint
    skin feature embeddings, and optionally blend them through the skinning
    weight matrix into per-vertex embeddings.
    """
    def __init__(self, skin_feat_channels: int = 8, skl_pos_embed_freq: int = 10, jp_embedder_config: Optional[Dict[str, Any]] = {}, jp_embed_dim: int = 128, relative_pe=True, vert_feat_is_linear=True, normalize_feat=True, **kwargs):
        # NOTE(review): `jp_embedder_config={}` is a mutable default; it is only
        # read here (unpacked into Embedder), so sharing is harmless — but a
        # None sentinel would be safer if it is ever mutated.
        super().__init__()
        self.skin_feat_channels = skin_feat_channels
        self.skl_pos_embed_freq = skl_pos_embed_freq
        self.jp_embedder_config = jp_embedder_config

        # Fourier features of joint positions, projected to jp_embed_dim.
        self.pos_embedder_fourier = FreqPositionalEmbedder(in_dim=3, max_freq_log2=self.skl_pos_embed_freq, num_freqs=self.skl_pos_embed_freq, include_input=True)
        self.pos_embedder_linear = nn.Linear(self.pos_embedder_fourier.out_dim, jp_embed_dim)
        # Learned embedding used as the "parent" of root joints.
        self.root_embedding = nn.Parameter(torch.zeros(1, jp_embed_dim))
        # Consumes concatenated [joint, parent] embeddings.
        self.joint_embedder = Embedder(in_dim=2 * jp_embed_dim, out_dim=jp_embed_dim, **self.jp_embedder_config)
        self.out_layer_vert = FeedForwardNet(channels=jp_embed_dim, out_channels=skin_feat_channels)
        self.out_layer_joint = FeedForwardNet(channels=jp_embed_dim, out_channels=skin_feat_channels)
        self.relative_pe = relative_pe
        self.vert_feat_is_linear = vert_feat_is_linear
        self.normalize_feat = normalize_feat

    def forward(self, joints_list: List[torch.Tensor], parents_list: List[torch.Tensor], skin_list: List[torch.Tensor]=None):
        """
        Returns:
            (joint_skin_embeds, vert_skin_embeds): per-sample lists; the
            second is None when skin_list is not provided.
        """
        vert_skin_embeds = [] if skin_list is not None else None
        joint_skin_embeds = []
        for i in range(len(joints_list)):
            parent_idx = parents_list[i].clone()
            joints = joints_list[i]
            if self.relative_pe:
                # Offset each joint by its parent's position. The appended
                # joints[:1] row makes parent_idx == -1 resolve to joint 0.
                # NOTE(review): this gives the root a zero offset only if the
                # root joint is index 0 — confirm against the data pipeline.
                joints = joints - torch.cat([joints, joints[:1]])[parent_idx]
            joints_pos_embed = self.pos_embedder_linear(self.pos_embedder_fourier(joints))
            # Append the learned root embedding so parent_idx == -1 selects it.
            joints_pos_embed = torch.cat([joints_pos_embed, self.root_embedding], dim=0)
            parents_pos_embed = joints_pos_embed[parent_idx]
            # Drop the appended root row before pairing joint/parent features.
            jp_pos_embed = torch.cat([joints_pos_embed[:-1], parents_pos_embed], dim=-1)
            joints_embed = self.joint_embedder(jp_pos_embed)
            if self.normalize_feat:
                joints_embed = torch.nn.functional.normalize(joints_embed, dim=-1)
            if skin_list is not None:
                vert_skin = skin_list[i]
                if self.vert_feat_is_linear:
                    # Project first, then blend linearly by skin weights.
                    joints_embed_for_vert = self.out_layer_vert(joints_embed)
                    vert_skin_embed = vert_skin @ joints_embed_for_vert
                else:
                    # Blend first, then apply the nonlinear projection.
                    vert_skin_embed = vert_skin @ joints_embed
                    vert_skin_embed = self.out_layer_vert(vert_skin_embed)
                vert_skin_embeds.append(vert_skin_embed)
            joints_embed = self.out_layer_joint(joints_embed)
            joint_skin_embeds.append(joints_embed)
        return joint_skin_embeds, vert_skin_embeds
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def clamp_with_grad(x, min, max):
    """
    Clamp ``x`` to ``[min, max]`` in the forward pass while letting the
    gradient pass through unchanged (straight-through estimator).
    """
    clamped = x.clamp(min, max)
    # Forward value is `clamped`; backward sees the identity on `x`.
    return x - (x - clamped).detach()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class TreeTransformerSkinDecoder(nn.Module):
    """
    Decode skinning weights from per-vertex and per-joint features with two
    parallel transformer stacks, matched at the end by a temperature-scaled
    dot-product softmax over joints.
    """
    # The principles of the tree transformer skinning model:
    # 1. joint features are related to the tree structure, since the decoding process is skeleton-agnostic.
    # 2. decode the skinning weights directly, hoping the transformer can handle the skinning assignment.
    # It's a pure learning-based method.
    def __init__(self,
                 skin_feat_channels: int,
                 model_channels: int=512,
                 num_heads=4,
                 num_blocks=4,
                 vert_cross_blocks_num: int = 1,
                 use_fp16: bool = False):
        super().__init__()
        self.skin_feat_channels = skin_feat_channels
        self.model_channels = model_channels
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        # Learned feature standing in for the (virtual) parent of root joints.
        self.root_features = nn.Parameter(torch.zeros([1, skin_feat_channels]), requires_grad=True)
        self.input_layer_vertex = nn.Linear(skin_feat_channels, model_channels)
        # Joints enter as [own, parent] feature pairs, hence 2x channels.
        self.input_layer_skin = nn.Linear(skin_feat_channels*2, model_channels)
        assert vert_cross_blocks_num <= num_blocks, f"vert_cross_blocks_num should be less than or equal to num_blocks, got {vert_cross_blocks_num} and {num_blocks}."
        self.vert_cross_blocks_num = vert_cross_blocks_num
        # First `vert_cross_blocks_num` vertex blocks cross-attend to joints;
        # the remainder are plain feed-forward blocks.
        self.blocks_vertex = nn.ModuleList([
            TransformerCrossBlock(
                channels=model_channels,
                ctx_channels=model_channels,
                num_heads=num_heads,
                mlp_ratio=4.0,
                attn_mode="full",
                no_self=True)
            for _ in range(self.vert_cross_blocks_num)
        ] + [
            FeedForwardNet(
                channels=model_channels,
                mlp_ratio=4.0,
                out_channels=model_channels,
            )
            for _ in range(num_blocks - self.vert_cross_blocks_num)
        ])
        self.blocks_skin = nn.ModuleList([
            TransformerBlock(
                channels=model_channels,
                num_heads=num_heads,
                mlp_ratio=4.0,
                attn_mode="full")
            for _ in range(num_blocks)
        ])
        # Vertex head outputs one extra channel: a per-vertex inverse temperature.
        self.out_layer_vertex = nn.Sequential(
            nn.Linear(model_channels, model_channels*4),
            nn.GELU(approximate="tanh"),
            nn.Linear(model_channels*4, model_channels+1),
        )
        self.out_layer_skin = nn.Sequential(
            nn.Linear(model_channels, model_channels*4),
            nn.GELU(approximate="tanh"),
            nn.Linear(model_channels*4, model_channels),
        )
        self.temp_activation = nn.ELU(alpha=1.0)
        self.dtype = torch.float16 if use_fp16 else torch.float32

    @property
    def device(self) -> torch.device:
        """
        Return the device of the model.
        """
        return next(self.parameters()).device

    def convert_to_fp16(self) -> None:
        """
        Convert the torso of the model to float16.
        """
        self.blocks_vertex.apply(convert_module_to_f16)
        self.blocks_skin.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Convert the torso of the model to float32.
        """
        self.blocks_vertex.apply(convert_module_to_f32)
        self.blocks_skin.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """Xavier-initialize every linear layer; zero its bias."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def forward(self, vertex_features, joint_features, parents) -> torch.Tensor:
        """
        Args:
            vertex_features: per-vertex skin features, batched with dim 0.
            joint_features: per-joint skin features, batched with dim 0.
            parents: parent index per joint (-1 for root).

        Returns:
            Skinning weight matrix, softmax-normalized over joints.
        """
        j_num = joint_features.shape[1]
        h_v = vertex_features
        h_v = self.input_layer_vertex(h_v)
        h_j = joint_features
        # Append root_features so remapped root parents (index j_num) hit it.
        h_j = torch.cat([h_j, self.root_features[None]], dim=1)
        # Remap parent == -1 (root) to the appended root_features row.
        parents = torch.where(parents < 0, torch.ones_like(parents)*j_num, parents)
        # Pair each joint with its parent's features.
        # NOTE(review): indexing with parents[0] assumes every batch item
        # shares the same topology (effectively batch size 1) — confirm.
        h_j = torch.cat([h_j[:, :-1], h_j[:, parents[0]]], dim=-1)
        h_j = self.input_layer_skin(h_j)
        # Torso runs in self.dtype (fp16 when use_fp16); heads back in input dtype.
        h_v = h_v.type(self.dtype)
        h_j = h_j.type(self.dtype)
        blocks_num = len(self.blocks_vertex)
        for idx, block_v, block_j in zip(range(blocks_num), self.blocks_vertex, self.blocks_skin):
            f_v, f_j = h_v, h_j
            # Cross-attend vertices to joints only in the first blocks.
            h_v = block_v(f_v, f_j) if idx < self.vert_cross_blocks_num else block_v(f_v)
            h_j = block_j(f_j)
        h_v = h_v.type(vertex_features.dtype)
        h_j = h_j.type(joint_features.dtype)
        h_v = self.out_layer_vertex(h_v)
        h_j = self.out_layer_skin(h_j)
        # Split off the per-vertex inverse-temperature channel.
        h_v, inv_temp = h_v[..., :-1], h_v[..., -1].unsqueeze(-1)
        # ELU output is in (-alpha, inf); shifting by alpha + 1 keeps the
        # inverse temperature strictly positive.
        inv_temp = self.temp_activation(inv_temp) + self.temp_activation.alpha + 1.0
        # Vertex-joint affinity, then softmax over joints.
        skin_weights = torch.einsum("nac,nbc->nab", h_v, h_j)
        skin_weights = torch.softmax(skin_weights * inv_temp, dim=-1)
        return skin_weights
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# Registry of skin-weight decoder architectures, keyed by the `model_type`
# field of the decoder config (see SkinAutoEncoder.__init__).
SKIN_MODEL_DICT = {'tree': TreeTransformerSkinDecoder}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class SkinAutoEncoder(nn.Module):
    """
    Auto-encoder for skinning weights: a SkinEncoder produces per-joint and
    per-vertex embeddings, and a decoder (looked up in SKIN_MODEL_DICT by
    `decoder_config['model_type']`) reconstructs the skinning matrix.
    """
    def __init__(self, encoder_config: Dict[str, Any], decoder_config: Dict[str, Any], use_fp16: bool = False):
        super().__init__()
        self.skin_encoder = SkinEncoder(**encoder_config)
        # Fix: operate on a shallow copy so the caller's decoder_config dict is
        # not mutated (the original code injected 'use_fp16' into it and popped
        # 'model_type' out of it, breaking re-use of the same config).
        decoder_config = dict(decoder_config)
        decoder_config['use_fp16'] = use_fp16
        self.skin_decoder = SKIN_MODEL_DICT[decoder_config.pop('model_type')](**decoder_config)

        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()
        else:
            self.convert_to_fp32()

    def convert_to_fp16(self) -> None:
        """Convert the decoder torso to float16 (the encoder stays fp32)."""
        self.skin_decoder.convert_to_fp16()

    def convert_to_fp32(self) -> None:
        """Convert the decoder torso to float32."""
        self.skin_decoder.convert_to_fp32()

    def initialize_weights(self) -> None:
        """Xavier-initialize every linear layer; zero its bias."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

    def encode(self, joints_list: List[torch.Tensor], parents_list: List[torch.Tensor], skin_list: List[torch.Tensor]):
        """Embed skeleton joints and skinning weights into feature space."""
        joint_skin_embeds, vert_skin_embeds = self.skin_encoder(joints_list, parents_list, skin_list)
        return joint_skin_embeds, vert_skin_embeds

    def decode(self, vertex_features, joint_features, parents) -> torch.Tensor:
        """Reconstruct the skinning weight matrix from embedded features."""
        skin_weights = self.skin_decoder(vertex_features, joint_features, parents)
        return skin_weights

    def forward(self, joints_list: List[torch.Tensor], parents_list: List[torch.Tensor], skin_list: List[torch.Tensor]):
        """
        Encode then decode each sample.

        Returns:
            (skin_pred_list, joint_skin_embeds, vert_skin_embeds) — per-sample
            reconstructed skinning matrices plus the intermediate embeddings.
        """
        joint_skin_embeds, vert_skin_embeds = self.skin_encoder(joints_list, parents_list, skin_list)
        skin_pred_list = []
        for i in range(len(joints_list)):
            # Decoder expects a leading batch dimension.
            skin_pred = self.skin_decoder(vert_skin_embeds[i][None], joint_skin_embeds[i][None], parents_list[i][None])
            skin_pred_list.append(skin_pred[0])
        return skin_pred_list, joint_skin_embeds, vert_skin_embeds
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
class AniGenElasticSLatEncoderGamma(SparseTransformerElasticMixin, SkinAutoEncoder):
    """
    Skin auto-encoder combined with elastic memory management, intended for
    training under low-VRAM budgets.
    """
|
anigen/modules/attention/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
|
| 3 |
+
# Default attention backend; may be overridden by the ATTN_BACKEND
# environment variable (below) or set_backend().
BACKEND = 'flash_attn'
# Extra debug checking flag; may be overridden by ATTN_DEBUG or set_debug().
DEBUG = False

def __from_env():
    """Apply ATTN_BACKEND / ATTN_DEBUG environment overrides at import time."""
    import os

    global BACKEND
    global DEBUG

    backend_override = os.environ.get('ATTN_BACKEND')
    debug_override = os.environ.get('ATTN_DEBUG')

    # Only accept known backends; anything else keeps the default.
    if backend_override in ('xformers', 'flash_attn', 'sdpa', 'naive'):
        BACKEND = backend_override
    if debug_override is not None:
        DEBUG = debug_override == '1'

    print(f"[ATTENTION] Using backend: {BACKEND}")


__from_env()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def set_backend(backend: Literal['xformers', 'flash_attn', 'sdpa', 'naive']):
    """
    Override the attention backend at runtime.

    The annotation now lists the same four backends accepted by the
    ATTN_BACKEND environment variable; previously it named only two,
    which was inconsistent with the env-var path.
    """
    global BACKEND
    BACKEND = backend
|
| 29 |
+
|
| 30 |
+
def set_debug(debug: bool):
    """Enable or disable attention debug checks at runtime."""
    global DEBUG
    DEBUG = debug
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
from .full_attn import *
|
| 36 |
+
from .modules import *
|
anigen/modules/attention/full_attn.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import math
|
| 4 |
+
from . import DEBUG, BACKEND
|
| 5 |
+
|
| 6 |
+
if BACKEND == 'xformers':
|
| 7 |
+
import xformers.ops as xops
|
| 8 |
+
elif BACKEND == 'flash_attn':
|
| 9 |
+
import flash_attn
|
| 10 |
+
elif BACKEND == 'sdpa':
|
| 11 |
+
from torch.nn.functional import scaled_dot_product_attention as sdpa
|
| 12 |
+
elif BACKEND == 'naive':
|
| 13 |
+
pass
|
| 14 |
+
else:
|
| 15 |
+
raise ValueError(f"Unknown attention backend: {BACKEND}")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Public API: a backend-dispatching scaled dot-product attention with
# multiple call signatures (see the @overload stubs below).
__all__ = [
    'scaled_dot_product_attention',
]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _naive_sdpa(q, k, v):
    """
    Pure-PyTorch scaled dot-product attention, used when BACKEND == 'naive'.

    Args:
        q, k, v: [N, L, H, C] tensors.

    Returns:
        A [N, L, H, C] attention output.
    """
    # Move heads ahead of the sequence axis: [N, L, H, C] -> [N, H, L, C].
    q = q.transpose(1, 2)
    k = k.transpose(1, 2)
    v = v.transpose(1, 2)
    scale_factor = 1 / math.sqrt(q.size(-1))
    attn_weight = torch.softmax(q @ k.transpose(-2, -1) * scale_factor, dim=-1)
    # Weighted sum of values, back to [N, L, H, C].
    return (attn_weight @ v).transpose(1, 2)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@overload
def scaled_dot_product_attention(qkv: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        qkv (torch.Tensor): A [N, L, 3, H, C] tensor containing Qs, Ks, and Vs.
    """
    ...

@overload
def scaled_dot_product_attention(q: torch.Tensor, kv: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        q (torch.Tensor): A [N, L, H, C] tensor containing Qs.
        kv (torch.Tensor): A [N, L, 2, H, C] tensor containing Ks and Vs.
    """
    ...

@overload
def scaled_dot_product_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    """
    Apply scaled dot product attention.

    Args:
        q (torch.Tensor): A [N, L, H, Ci] tensor containing Qs.
        k (torch.Tensor): A [N, L, H, Ci] tensor containing Ks.
        v (torch.Tensor): A [N, L, H, Co] tensor containing Vs.

    Note:
        k and v are assumed to have the same coordinate map.
    """
    ...

def scaled_dot_product_attention(*args, **kwargs):
    # Runtime dispatcher for the overloads above.  Accepts packed qkv, packed
    # kv, or separate q/k/v tensors, validates shapes, then routes to the
    # attention implementation selected by the module-level BACKEND flag.
    arg_names_dict = {
        1: ['qkv'],
        2: ['q', 'kv'],
        3: ['q', 'k', 'v']
    }
    num_all_args = len(args) + len(kwargs)
    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
    # Positional args fill the leading parameter names; the remainder must be
    # supplied as keyword arguments.
    for key in arg_names_dict[num_all_args][len(args):]:
        assert key in kwargs, f"Missing argument {key}"

    if num_all_args == 1:
        # Packed self-attention input: [N, L, 3, H, C].
        qkv = args[0] if len(args) > 0 else kwargs['qkv']
        assert len(qkv.shape) == 5 and qkv.shape[2] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, L, 3, H, C]"
        device = qkv.device

    elif num_all_args == 2:
        # Query plus packed key/value: q [N, L, H, C], kv [N, L, 2, H, C].
        q = args[0] if len(args) > 0 else kwargs['q']
        kv = args[1] if len(args) > 1 else kwargs['kv']
        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
        assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
        device = q.device

    elif num_all_args == 3:
        # Fully separate q/k/v tensors.
        q = args[0] if len(args) > 0 else kwargs['q']
        k = args[1] if len(args) > 1 else kwargs['k']
        v = args[2] if len(args) > 2 else kwargs['v']
        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
        assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
        assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
        assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
        device = q.device
    # NOTE(review): `device` is assigned in every branch but never used below.

    if BACKEND == 'xformers':
        # xformers consumes unpacked [N, L, H, C] tensors directly.
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        out = xops.memory_efficient_attention(q, k, v)
    elif BACKEND == 'flash_attn':
        # flash_attn has dedicated entry points for each packing layout.
        if num_all_args == 1:
            out = flash_attn.flash_attn_qkvpacked_func(qkv)
        elif num_all_args == 2:
            out = flash_attn.flash_attn_kvpacked_func(q, kv)
        elif num_all_args == 3:
            out = flash_attn.flash_attn_func(q, k, v)
    elif BACKEND == 'sdpa':
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        # torch's scaled_dot_product_attention expects [N, H, L, C];
        # transpose in and back out.
        q = q.permute(0, 2, 1, 3)   # [N, H, L, C]
        k = k.permute(0, 2, 1, 3)   # [N, H, L, C]
        v = v.permute(0, 2, 1, 3)   # [N, H, L, C]
        out = sdpa(q, k, v)         # [N, H, L, C]
        out = out.permute(0, 2, 1, 3)   # [N, L, H, C]
    elif BACKEND == 'naive':
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=2)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=2)
        out = _naive_sdpa(q, k, v)
    else:
        raise ValueError(f"Unknown attention module: {BACKEND}")

    return out
|
anigen/modules/attention/modules.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from .full_attn import scaled_dot_product_attention
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class MultiHeadRMSNorm(nn.Module):
    """RMS normalization over the channel dim with a learnable per-head gain."""

    def __init__(self, dim: int, heads: int):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(heads, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in float32 for numerical stability, then cast back to the
        # caller's dtype.
        original_dtype = x.dtype
        normed = F.normalize(x.float(), dim=-1)
        return (normed * self.gamma * self.scale).to(original_dtype)
|
| 18 |
+
class RotaryPositionEmbedder(nn.Module):
    """
    Rotary position embedding (RoPE) over multi-dimensional spatial indices.

    The hidden dimension is split evenly across `in_channels` spatial axes;
    each axis contributes `hidden_size // in_channels // 2` complex
    frequencies following the standard base-10000 RoPE schedule.
    """

    def __init__(self, hidden_size: int, in_channels: int = 3):
        super().__init__()
        assert hidden_size % 2 == 0, "Hidden size must be divisible by 2"
        self.hidden_size = hidden_size
        self.in_channels = in_channels
        # Number of complex frequencies allotted to each spatial axis.
        self.freq_dim = hidden_size // in_channels // 2
        self.freqs = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
        self.freqs = 1.0 / (10000 ** self.freqs)

    def _get_phases(self, indices: torch.Tensor) -> torch.Tensor:
        # Unit complex phasors e^{i * index * freq} for each (index, freq)
        # pair.  Note: reassigns self.freqs to cache it on indices' device.
        self.freqs = self.freqs.to(indices.device)
        phases = torch.outer(indices, self.freqs)
        phases = torch.polar(torch.ones_like(phases), phases)
        return phases

    def _rotary_embedding(self, x: torch.Tensor, phases: torch.Tensor) -> torch.Tensor:
        # Interpret consecutive channel pairs of x as complex numbers and
        # rotate them by the given phases; returns the same dtype as x.
        x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        x_rotated = x_complex * phases
        x_embed = torch.view_as_real(x_rotated).reshape(*x_rotated.shape[:-1], -1).to(x.dtype)
        return x_embed

    def forward(self, q: torch.Tensor, k: torch.Tensor, indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            q (torch.Tensor): [..., N, D] tensor of queries
            k (torch.Tensor): [..., N, D] tensor of keys
            indices (torch.Tensor): [..., N, C] tensor of spatial positions;
                defaults to sequential 1D positions when omitted.

        Returns:
            Tuple of the rotary-embedded (q, k) tensors, same shapes as input.
        """
        if indices is None:
            indices = torch.arange(q.shape[-2], device=q.device)
            if len(q.shape) > 2:
                indices = indices.unsqueeze(0).expand(q.shape[:-2] + (-1,))

        # Flatten all leading dims to compute phases, then restore the shape
        # with all spatial axes' frequencies concatenated along the last dim.
        phases = self._get_phases(indices.reshape(-1)).reshape(*indices.shape[:-1], -1)
        if phases.shape[1] < self.hidden_size // 2:
            # Pad any leftover channels (hidden_size not divisible by
            # in_channels) with identity rotations (phase 0).
            phases = torch.cat([phases, torch.polar(
                torch.ones(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device),
                torch.zeros(*phases.shape[:-1], self.hidden_size // 2 - phases.shape[1], device=phases.device)
            )], dim=-1)
        q_embed = self._rotary_embedding(q, phases)
        k_embed = self._rotary_embedding(k, phases)
        return q_embed, k_embed
|
| 63 |
+
class LoRALinear(nn.Linear):
    """
    Linear layer with an additive low-rank (LoRA) update.

    Computes ``W x + b + lr_rate * ((x @ A) @ B)`` where A is
    [in_features, rank] (zero-initialized, so the LoRA branch starts as a
    no-op) and B is [rank, out_features] (small random initialization).
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True, rank: int = 4, lr_rate: float = 1.0):
        super().__init__(in_features, out_features, bias)
        self.rank = rank
        self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
        self.lora_B = nn.Parameter(torch.randn(rank, out_features) * 1e-2)
        self.lr_rate = lr_rate

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        base = super().forward(x)
        low_rank = (x @ self.lora_A) @ self.lora_B
        return base + low_rank * self.lr_rate
|
| 74 |
+
class MultiHeadAttention(nn.Module):
    """
    Dense multi-head self/cross attention with optional RoPE, QK RMS
    normalization, and LoRA-augmented projections.
    """

    def __init__(
        self,
        channels: int,
        num_heads: int,
        ctx_channels: Optional[int]=None,
        type: Literal["self", "cross"] = "self",
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        qkv_bias: bool = True,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        x_is_query: bool = False,
        use_lora: bool = False,
        lora_rank: int = 4,
        lora_lr_rate: float = 1.0,
    ):
        super().__init__()
        assert channels % num_heads == 0
        assert type in ["self", "cross"], f"Invalid attention type: {type}"
        assert attn_mode in ["full", "windowed"], f"Invalid attention mode: {attn_mode}"
        assert type == "self" or attn_mode == "full", "Cross-attention only supports full attention"

        if attn_mode == "windowed":
            raise NotImplementedError("Windowed attention is not yet implemented")

        self.channels = channels
        self.head_dim = channels // num_heads
        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
        self.num_heads = num_heads
        self._type = type
        self.attn_mode = attn_mode
        self.window_size = window_size
        self.shift_window = shift_window
        self.use_rope = use_rope
        self.qk_rms_norm = qk_rms_norm

        # Projections: fused qkv for self-attention, separate q/kv for cross.
        # With x_is_query, the query is used as-is (identity projection).
        if self._type == "self":
            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias) if not use_lora else LoRALinear(channels, channels * 3, bias=qkv_bias, rank=lora_rank, lr_rate=lora_lr_rate)
        else:
            self.to_q = (lambda x: x) if x_is_query else (nn.Linear(channels, channels, bias=qkv_bias) if not use_lora else LoRALinear(channels, channels, bias=qkv_bias, rank=lora_rank, lr_rate=lora_lr_rate))
            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias) if not use_lora else LoRALinear(self.ctx_channels, channels * 2, bias=qkv_bias, rank=lora_rank, lr_rate=lora_lr_rate)

        if self.qk_rms_norm:
            self.q_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)
            self.k_rms_norm = MultiHeadRMSNorm(self.head_dim, num_heads)

        self.to_out = nn.Linear(channels, channels) if not use_lora else LoRALinear(channels, channels, rank=lora_rank, lr_rate=lora_lr_rate)

        if use_rope:
            # NOTE(review): the embedder is sized with hidden_size=channels
            # but applied to per-head [B, L, H, head_dim] tensors in forward;
            # confirm the phase-padding path handles head_dim < channels // 2
            # as intended.
            self.rope = RotaryPositionEmbedder(channels)

    def forward(self, x: torch.Tensor, context: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            x: [B, L, C] input tokens (queries; also keys/values for self).
            context: [B, Lkv, ctx_channels] context tokens (cross only).
            indices: optional positions forwarded to RoPE (self only).

        Returns:
            [B, L, C] attended output.
        """
        B, L, C = x.shape
        if self._type == "self":
            qkv = self.to_qkv(x)
            qkv = qkv.reshape(B, L, 3, self.num_heads, -1)
            if self.use_rope:
                # Rotate q and k by position, leave v untouched.
                q, k, v = qkv.unbind(dim=2)
                q, k = self.rope(q, k, indices)
                qkv = torch.stack([q, k, v], dim=2)
            if self.attn_mode == "full":
                if self.qk_rms_norm:
                    q, k, v = qkv.unbind(dim=2)
                    q = self.q_rms_norm(q)
                    k = self.k_rms_norm(k)
                    h = scaled_dot_product_attention(q, k, v)
                else:
                    h = scaled_dot_product_attention(qkv)
            elif self.attn_mode == "windowed":
                raise NotImplementedError("Windowed attention is not yet implemented")
        else:
            Lkv = context.shape[1]
            q = self.to_q(x)
            kv = self.to_kv(context)
            q = q.reshape(B, L, self.num_heads, -1)
            kv = kv.reshape(B, Lkv, 2, self.num_heads, -1)
            if self.qk_rms_norm:
                q = self.q_rms_norm(q)
                k, v = kv.unbind(dim=2)
                k = self.k_rms_norm(k)
                h = scaled_dot_product_attention(q, k, v)
            else:
                h = scaled_dot_product_attention(q, kv)
        # Merge heads and project out.
        h = h.reshape(B, L, -1)
        h = self.to_out(h)
        return h
|
anigen/modules/norm.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class LayerNorm32(nn.LayerNorm):
    """LayerNorm computed in float32, returning the input's dtype."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normalized = super().forward(x.float())
        return normalized.type(x.dtype)
|
| 10 |
+
class GroupNorm32(nn.GroupNorm):
    """
    A GroupNorm layer that converts to float32 before the forward pass.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normalized = super().forward(x.float())
        return normalized.type(x.dtype)
|
| 18 |
+
class ChannelLayerNorm32(LayerNorm32):
    """LayerNorm32 applied over the channel axis (dim 1) of [N, C, ...] input."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Move channels to the last position, normalize, then move them back.
        channels_last = x.movedim(1, -1).contiguous()
        normalized = super().forward(channels_last)
        return normalized.movedim(-1, 1).contiguous()
|
| 25 |
+
|
anigen/modules/sparse/__init__.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
|
| 3 |
+
# Package-level configuration for the sparse modules, overridable via
# environment variables (see __from_env below) or the setters further down.
BACKEND = 'spconv'    # sparse convolution backend: 'spconv' or 'torchsparse'
DEBUG = False         # enable extra consistency checks
ATTN = 'flash_attn'   # sparse attention backend: 'xformers' or 'flash_attn'

def __from_env():
    """Override the defaults above from environment variables, if set."""
    import os

    global BACKEND
    global DEBUG
    global ATTN

    backend_env = os.environ.get('SPARSE_BACKEND')
    debug_env = os.environ.get('SPARSE_DEBUG')
    attn_env = os.environ.get('SPARSE_ATTN_BACKEND')
    if attn_env is None:
        # Fall back to the generic attention-backend variable.
        attn_env = os.environ.get('ATTN_BACKEND')

    if backend_env in ('spconv', 'torchsparse'):
        BACKEND = backend_env
    if debug_env is not None:
        DEBUG = debug_env == '1'
    if attn_env in ('xformers', 'flash_attn'):
        ATTN = attn_env

    print(f"[SPARSE] Backend: {BACKEND}, Attention: {ATTN}")


__from_env()
| 32 |
+
|
| 33 |
+
def set_backend(backend: Literal['spconv', 'torchsparse']):
    """Select the sparse convolution backend at runtime."""
    global BACKEND
    BACKEND = backend

def set_debug(debug: bool):
    """Enable or disable extra debug consistency checks."""
    global DEBUG
    DEBUG = debug

def set_attn(attn: Literal['xformers', 'flash_attn']):
    """Select the sparse attention backend at runtime."""
    global ATTN
    ATTN = attn
|
| 46 |
+
import importlib

# Lazily-imported public attributes: maps attribute name -> the submodule
# that defines it.  Resolved on first access by __getattr__ below so that
# heavyweight backends are only imported when actually used.
__attributes = {
    'SparseTensor': 'basic',
    'sparse_batch_broadcast': 'basic',
    'sparse_batch_op': 'basic',
    'sparse_cat': 'basic',
    'sparse_unbind': 'basic',
    'SparseGroupNorm': 'norm',
    'SparseLayerNorm': 'norm',
    'SparseGroupNorm32': 'norm',
    'SparseLayerNorm32': 'norm',
    'SparseReLU': 'nonlinearity',
    'SparseSiLU': 'nonlinearity',
    'SparseGELU': 'nonlinearity',
    'SparseActivation': 'nonlinearity',
    'SparseLinear': 'linear',
    'sparse_scaled_dot_product_attention': 'attention',
    'SerializeMode': 'attention',
    'sparse_serialized_scaled_dot_product_self_attention': 'attention',
    'sparse_windowed_scaled_dot_product_self_attention': 'attention',
    'SparseMultiHeadAttention': 'attention',
    'SparseConv3d': 'conv',
    'SparseInverseConv3d': 'conv',
    'SparseDownsample': 'spatial',
    'SparseUpsample': 'spatial',
    'SparseSubdivide' : 'spatial'
}

# Submodules exposed lazily as attributes of this package.
__submodules = ['transformer']

__all__ = list(__attributes.keys()) + __submodules
|
| 79 |
+
def __getattr__(name):
    """Lazily resolve package attributes on first access (PEP 562)."""
    if name in globals():
        return globals()[name]
    if name in __attributes:
        # Import the defining submodule and cache the attribute.
        module = importlib.import_module(f".{__attributes[name]}", __name__)
        globals()[name] = getattr(module, name)
    elif name in __submodules:
        globals()[name] = importlib.import_module(f".{name}", __name__)
    else:
        raise AttributeError(f"module {__name__} has no attribute {name}")
    return globals()[name]
| 92 |
+
|
| 93 |
+
# For Pylance
# This branch never runs when the package is imported normally; it exists
# only so static analyzers can see the names resolved lazily by __getattr__.
if __name__ == '__main__':
    from .basic import *
    from .norm import *
    from .nonlinearity import *
    from .linear import *
    from .attention import *
    from .conv import *
    from .spatial import *
    # NOTE(review): this looks like it should be `from . import transformer`;
    # a bare `import transformer` would not resolve the package submodule.
    import transformer
|
anigen/modules/sparse/attention/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .full_attn import *
|
| 2 |
+
from .serialized_attn import *
|
| 3 |
+
from .windowed_attn import *
|
| 4 |
+
from .modules import *
|
| 5 |
+
from .windowed_attn_cross import *
|
anigen/modules/sparse/attention/full_attn.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
from .. import SparseTensor
|
| 4 |
+
from .. import DEBUG, ATTN
|
| 5 |
+
|
| 6 |
+
if ATTN == 'xformers':
|
| 7 |
+
import xformers.ops as xops
|
| 8 |
+
elif ATTN == 'flash_attn':
|
| 9 |
+
import flash_attn
|
| 10 |
+
else:
|
| 11 |
+
raise ValueError(f"Unknown attention module: {ATTN}")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Public API of this module: varlen attention over SparseTensor layouts.
__all__ = [
    'sparse_scaled_dot_product_attention',
]
| 19 |
+
@overload
def sparse_scaled_dot_product_attention(qkv: SparseTensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        qkv (SparseTensor): A [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, kv: Union[SparseTensor, torch.Tensor]) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, C] sparse tensor containing Qs.
        kv (SparseTensor or torch.Tensor): A [N, *, 2, H, C] sparse tensor or a [N, L, 2, H, C] dense tensor containing Ks and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: torch.Tensor, kv: SparseTensor) -> torch.Tensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (torch.Tensor): A [N, L, H, C] dense tensor containing Qs.
        kv (SparseTensor): A [N, *, 2, H, C] sparse tensor containing Ks and Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, k: SparseTensor, v: SparseTensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.

    Note:
        k and v are assumed to have the same coordinate map.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: SparseTensor, k: torch.Tensor, v: torch.Tensor) -> SparseTensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (SparseTensor): A [N, *, H, Ci] sparse tensor containing Qs.
        k (torch.Tensor): A [N, L, H, Ci] dense tensor containing Ks.
        v (torch.Tensor): A [N, L, H, Co] dense tensor containing Vs.
    """
    ...

@overload
def sparse_scaled_dot_product_attention(q: torch.Tensor, k: SparseTensor, v: SparseTensor) -> torch.Tensor:
    """
    Apply scaled dot product attention to a sparse tensor.

    Args:
        q (torch.Tensor): A [N, L, H, Ci] dense tensor containing Qs.
        k (SparseTensor): A [N, *, H, Ci] sparse tensor containing Ks.
        v (SparseTensor): A [N, *, H, Co] sparse tensor containing Vs.
    """
    ...

def sparse_scaled_dot_product_attention(*args, **kwargs):
    # Runtime dispatcher for the overloads above.  Flattens sparse and/or
    # dense inputs into varlen token streams with per-batch sequence lengths,
    # runs the backend selected by ATTN, and re-wraps the output in the
    # query's sparse layout when the query was sparse.
    arg_names_dict = {
        1: ['qkv'],
        2: ['q', 'kv'],
        3: ['q', 'k', 'v']
    }
    num_all_args = len(args) + len(kwargs)
    assert num_all_args in arg_names_dict, f"Invalid number of arguments, got {num_all_args}, expected 1, 2, or 3"
    for key in arg_names_dict[num_all_args][len(args):]:
        assert key in kwargs, f"Missing argument {key}"

    if num_all_args == 1:
        # Packed sparse self-attention: qkv is a [N, *, 3, H, C] SparseTensor.
        qkv = args[0] if len(args) > 0 else kwargs['qkv']
        assert isinstance(qkv, SparseTensor), f"qkv must be a SparseTensor, got {type(qkv)}"
        assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"
        device = qkv.device

        # Keep the sparse wrapper (s) to restore the layout on return;
        # per-batch sequence lengths come from the layout slices.
        s = qkv
        q_seqlen = [qkv.layout[i].stop - qkv.layout[i].start for i in range(qkv.shape[0])]
        kv_seqlen = q_seqlen
        qkv = qkv.feats     # [T, 3, H, C]

    elif num_all_args == 2:
        # q + packed kv; either side may be sparse or dense (but not both dense).
        q = args[0] if len(args) > 0 else kwargs['q']
        kv = args[1] if len(args) > 1 else kwargs['kv']
        assert isinstance(q, SparseTensor) and isinstance(kv, (SparseTensor, torch.Tensor)) or \
               isinstance(q, torch.Tensor) and isinstance(kv, SparseTensor), \
               f"Invalid types, got {type(q)} and {type(kv)}"
        assert q.shape[0] == kv.shape[0], f"Batch size mismatch, got {q.shape[0]} and {kv.shape[0]}"
        device = q.device

        if isinstance(q, SparseTensor):
            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, C]"
            s = q
            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
            q = q.feats     # [T_Q, H, C]
        else:
            # Dense query: every batch entry has the same sequence length L.
            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, C]"
            s = None
            N, L, H, C = q.shape
            q_seqlen = [L] * N
            q = q.reshape(N * L, H, C)   # [T_Q, H, C]

        if isinstance(kv, SparseTensor):
            assert len(kv.shape) == 4 and kv.shape[1] == 2, f"Invalid shape for kv, got {kv.shape}, expected [N, *, 2, H, C]"
            kv_seqlen = [kv.layout[i].stop - kv.layout[i].start for i in range(kv.shape[0])]
            kv = kv.feats     # [T_KV, 2, H, C]
        else:
            assert len(kv.shape) == 5, f"Invalid shape for kv, got {kv.shape}, expected [N, L, 2, H, C]"
            N, L, _, H, C = kv.shape
            kv_seqlen = [L] * N
            kv = kv.reshape(N * L, 2, H, C)   # [T_KV, 2, H, C]

    elif num_all_args == 3:
        # Separate q/k/v; k and v must share a type (and coordinate map).
        q = args[0] if len(args) > 0 else kwargs['q']
        k = args[1] if len(args) > 1 else kwargs['k']
        v = args[2] if len(args) > 2 else kwargs['v']
        assert isinstance(q, SparseTensor) and isinstance(k, (SparseTensor, torch.Tensor)) and type(k) == type(v) or \
               isinstance(q, torch.Tensor) and isinstance(k, SparseTensor) and isinstance(v, SparseTensor), \
               f"Invalid types, got {type(q)}, {type(k)}, and {type(v)}"
        assert q.shape[0] == k.shape[0] == v.shape[0], f"Batch size mismatch, got {q.shape[0]}, {k.shape[0]}, and {v.shape[0]}"
        device = q.device

        if isinstance(q, SparseTensor):
            assert len(q.shape) == 3, f"Invalid shape for q, got {q.shape}, expected [N, *, H, Ci]"
            s = q
            q_seqlen = [q.layout[i].stop - q.layout[i].start for i in range(q.shape[0])]
            q = q.feats     # [T_Q, H, Ci]
        else:
            assert len(q.shape) == 4, f"Invalid shape for q, got {q.shape}, expected [N, L, H, Ci]"
            s = None
            N, L, H, CI = q.shape
            q_seqlen = [L] * N
            q = q.reshape(N * L, H, CI)   # [T_Q, H, Ci]

        if isinstance(k, SparseTensor):
            assert len(k.shape) == 3, f"Invalid shape for k, got {k.shape}, expected [N, *, H, Ci]"
            assert len(v.shape) == 3, f"Invalid shape for v, got {v.shape}, expected [N, *, H, Co]"
            kv_seqlen = [k.layout[i].stop - k.layout[i].start for i in range(k.shape[0])]
            k = k.feats     # [T_KV, H, Ci]
            v = v.feats     # [T_KV, H, Co]
        else:
            assert len(k.shape) == 4, f"Invalid shape for k, got {k.shape}, expected [N, L, H, Ci]"
            assert len(v.shape) == 4, f"Invalid shape for v, got {v.shape}, expected [N, L, H, Co]"
            N, L, H, CI, CO = *k.shape, v.shape[-1]
            kv_seqlen = [L] * N
            k = k.reshape(N * L, H, CI)   # [T_KV, H, Ci]
            v = v.reshape(N * L, H, CO)   # [T_KV, H, Co]

    if DEBUG:
        if s is not None:
            for i in range(s.shape[0]):
                # NOTE(review): comparing all coordinate columns to the batch
                # index looks wrong — presumably only the batch column
                # (s.coords[s.layout[i], 0]) should be checked; confirm.
                assert (s.coords[s.layout[i]] == i).all(), f"SparseScaledDotProductSelfAttention: batch index mismatch"
        if num_all_args in [2, 3]:
            # NOTE(review): torch.Size (a tuple) never equals a list, so these
            # asserts always fail when DEBUG is on; the expected shape also
            # appears stale given q/k/v are flattened to [T, H, C] above.
            assert q.shape[:2] == [1, sum(q_seqlen)], f"SparseScaledDotProductSelfAttention: q shape mismatch"
        if num_all_args == 3:
            assert k.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: k shape mismatch"
            assert v.shape[:2] == [1, sum(kv_seqlen)], f"SparseScaledDotProductSelfAttention: v shape mismatch"

    if ATTN == 'xformers':
        # xformers path: fake batch dim of 1 plus a block-diagonal mask that
        # isolates each batch entry's tokens.
        if num_all_args == 1:
            q, k, v = qkv.unbind(dim=1)
        elif num_all_args == 2:
            k, v = kv.unbind(dim=1)
        q = q.unsqueeze(0)
        k = k.unsqueeze(0)
        v = v.unsqueeze(0)
        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seqlen, kv_seqlen)
        out = xops.memory_efficient_attention(q, k, v, mask)[0]
    elif ATTN == 'flash_attn':
        # flash_attn varlen path: cumulative sequence-length offsets delimit
        # each batch entry within the flattened token stream.
        cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(q_seqlen), dim=0)]).int().to(device)
        if num_all_args in [2, 3]:
            cu_seqlens_kv = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(kv_seqlen), dim=0)]).int().to(device)
        if num_all_args == 1:
            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens_q, max(q_seqlen))
        elif num_all_args == 2:
            out = flash_attn.flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
        elif num_all_args == 3:
            out = flash_attn.flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_kv, max(q_seqlen), max(kv_seqlen))
    else:
        raise ValueError(f"Unknown attention module: {ATTN}")

    # Sparse query: re-wrap the output features in the query's layout.
    if s is not None:
        return s.replace(out)
    else:
        return out.reshape(N, L, H, -1)
|
anigen/modules/sparse/attention/modules.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from .. import SparseTensor
|
| 6 |
+
from .full_attn import sparse_scaled_dot_product_attention
|
| 7 |
+
from .serialized_attn import SerializeMode, sparse_serialized_scaled_dot_product_self_attention
|
| 8 |
+
from .windowed_attn import sparse_windowed_scaled_dot_product_self_attention
|
| 9 |
+
from .windowed_attn_cross import sparse_windowed_scaled_dot_product_cross_attention
|
| 10 |
+
from ...attention import RotaryPositionEmbedder
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SparseMultiHeadRMSNorm(nn.Module):
    """RMS normalization over the channel dim, for sparse or dense tensors,
    with a learnable per-head gain."""

    def __init__(self, dim: int, heads: int):
        super().__init__()
        self.scale = dim ** 0.5
        self.gamma = nn.Parameter(torch.ones(heads, dim))

    def forward(self, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
        original_dtype = x.dtype
        # Normalize in float32 for numerical stability.
        x = x.float()
        if isinstance(x, SparseTensor):
            normed = x.replace(F.normalize(x.feats, dim=-1))
        else:
            normed = F.normalize(x, dim=-1)
        return (normed * self.gamma * self.scale).to(original_dtype)
|
| 28 |
+
|
| 29 |
+
class SparseMultiHeadAttention(nn.Module):
    """Multi-head attention for sparse (and optionally dense) tensors.

    Supports self- and cross-attention, three sparse attention modes
    ("full", "serialized", "windowed"), optional rotary position embeddings
    (self-attention only), and optional per-head RMS normalization of Q/K.
    """

    def __init__(
        self,
        channels: int,
        num_heads: int,
        ctx_channels: Optional[int] = None,
        type: Literal["self", "cross"] = "self",
        attn_mode: Literal["full", "serialized", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        qkv_bias: bool = True,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        cross_attn_cache_suffix: str = '',
    ):
        """
        Args:
            channels: Width of the query stream; must be divisible by num_heads.
            num_heads: Number of attention heads.
            ctx_channels: Width of the context (K/V) stream; defaults to channels.
            type: "self" or "cross" attention.
            attn_mode: "full", "serialized", or "windowed" sparse attention.
            window_size: Window size for serialized/windowed modes.
            shift_sequence: Sequence shift for serialized mode.
            shift_window: Coordinate shift for serialized/windowed modes.
            serialize_mode: Space-filling-curve ordering for serialized mode.
            qkv_bias: Whether the Q/K/V projections carry a bias term.
            use_rope: Apply rotary position embeddings (self-attention only).
            qk_rms_norm: RMS-normalize Q and K per head before attention.
            cross_attn_cache_suffix: Suffix appended to the windowed
                cross-attention spatial-cache key so multiple layers sharing
                the same tensors do not collide.
        """
        super().__init__()
        assert channels % num_heads == 0
        assert type in ["self", "cross"], f"Invalid attention type: {type}"
        assert attn_mode in ["full", "serialized", "windowed"], f"Invalid attention mode: {attn_mode}"
        # Serialized cross-attention is unimplemented (forward() raises for it).
        assert type == "self" or (attn_mode == "full" or attn_mode == "windowed"), "Cross-attention only supports full and windowed attention"
        assert type == "self" or use_rope is False, "Rotary position embeddings only supported for self-attention"
        self.channels = channels
        self.ctx_channels = ctx_channels if ctx_channels is not None else channels
        self.num_heads = num_heads
        self._type = type
        self.attn_mode = attn_mode
        self.window_size = window_size
        self.shift_sequence = shift_sequence
        self.shift_window = shift_window
        self.serialize_mode = serialize_mode
        self.use_rope = use_rope
        self.qk_rms_norm = qk_rms_norm

        if self._type == "self":
            # Single fused projection producing Q, K, and V together.
            self.to_qkv = nn.Linear(channels, channels * 3, bias=qkv_bias)
        else:
            # Separate projections: Q from x, fused K/V from the context stream.
            self.to_q = nn.Linear(channels, channels, bias=qkv_bias)
            self.to_kv = nn.Linear(self.ctx_channels, channels * 2, bias=qkv_bias)
            self.cross_attn_cache_suffix = cross_attn_cache_suffix

        if self.qk_rms_norm:
            self.q_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)
            self.k_rms_norm = SparseMultiHeadRMSNorm(channels // num_heads, num_heads)

        self.to_out = nn.Linear(channels, channels)

        if use_rope:
            self.rope = RotaryPositionEmbedder(channels)

    @staticmethod
    def _linear(module: nn.Linear, x: Union[SparseTensor, torch.Tensor]) -> Union[SparseTensor, torch.Tensor]:
        # Apply a linear layer to a SparseTensor's feature matrix or a dense tensor.
        if isinstance(x, SparseTensor):
            return x.replace(module(x.feats))
        else:
            return module(x)

    @staticmethod
    def _reshape_chs(x: Union[SparseTensor, torch.Tensor], shape: Tuple[int, ...]) -> Union[SparseTensor, torch.Tensor]:
        # Reshape trailing channel dims; dense tensors keep their first two dims.
        if isinstance(x, SparseTensor):
            return x.reshape(*shape)
        else:
            return x.reshape(*x.shape[:2], *shape)

    def _fused_pre(self, x: Union[SparseTensor, torch.Tensor], num_fused: int) -> Union[SparseTensor, torch.Tensor]:
        # Split a fused projection into [..., num_fused, H, C_head].
        if isinstance(x, SparseTensor):
            # Temporarily add a batch dim so sparse and dense share one reshape.
            x_feats = x.feats.unsqueeze(0)
        else:
            x_feats = x
        x_feats = x_feats.reshape(*x_feats.shape[:2], num_fused, self.num_heads, -1)
        return x.replace(x_feats.squeeze(0)) if isinstance(x, SparseTensor) else x_feats

    def _rope(self, qkv: SparseTensor) -> SparseTensor:
        # Apply rotary embeddings to Q and K using the spatial coordinates
        # (column 0 of coords is the batch index and is dropped).
        q, k, v = qkv.feats.unbind(dim=1)   # [T, H, C]
        q, k = self.rope(q, k, qkv.coords[:, 1:])
        qkv = qkv.replace(torch.stack([q, k, v], dim=1))
        return qkv

    def forward(self, x: Union[SparseTensor, torch.Tensor], context: Optional[Union[SparseTensor, torch.Tensor]] = None) -> Union[SparseTensor, torch.Tensor]:
        """Compute attention over `x`.

        For cross-attention, `context` supplies keys/values; it is ignored
        for self-attention. Returns a tensor of the same container type and
        channel width as `x`.
        """
        if self._type == "self":
            qkv = self._linear(self.to_qkv, x)
            qkv = self._fused_pre(qkv, num_fused=3)
            if self.use_rope:
                qkv = self._rope(qkv)
            if self.qk_rms_norm:
                q, k, v = qkv.unbind(dim=1)
                q = self.q_rms_norm(q)
                k = self.k_rms_norm(k)
                qkv = qkv.replace(torch.stack([q.feats, k.feats, v.feats], dim=1))
            if self.attn_mode == "full":
                h = sparse_scaled_dot_product_attention(qkv)
            elif self.attn_mode == "serialized":
                h = sparse_serialized_scaled_dot_product_self_attention(
                    qkv, self.window_size, serialize_mode=self.serialize_mode, shift_sequence=self.shift_sequence, shift_window=self.shift_window
                )
            elif self.attn_mode == "windowed":
                h = sparse_windowed_scaled_dot_product_self_attention(
                    qkv, self.window_size, shift_window=self.shift_window
                )
        else:
            q = self._linear(self.to_q, x)
            q = self._reshape_chs(q, (self.num_heads, -1))
            kv = self._linear(self.to_kv, context)
            kv = self._fused_pre(kv, num_fused=2)
            if self.qk_rms_norm:
                q = self.q_rms_norm(q)
                k, v = kv.unbind(dim=1)
                k = self.k_rms_norm(k)
                kv = kv.replace(torch.stack([k.feats, v.feats], dim=1))
            if self.attn_mode == "full":
                h = sparse_scaled_dot_product_attention(q, kv)
            elif self.attn_mode == "windowed":
                # Windowed cross-attention expects q shaped [*, 1, H, C].
                q = self._fused_pre(q, num_fused=1)
                h = sparse_windowed_scaled_dot_product_cross_attention(
                    q, kv, self.window_size, shift_window=self.shift_window,
                    cache_suffix=self.cross_attn_cache_suffix
                )
            elif self.attn_mode == "serialized":
                raise NotImplementedError("Serialized attention is not supported for cross-attention")
        # Merge heads back into a single channel dim and project out.
        h = self._reshape_chs(h, (-1,))
        h = self._linear(self.to_out, h)
        return h
|
anigen/modules/sparse/attention/serialized_attn.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
from enum import Enum
|
| 3 |
+
import torch
|
| 4 |
+
import math
|
| 5 |
+
from .. import SparseTensor
|
| 6 |
+
from .. import DEBUG, ATTN
|
| 7 |
+
|
| 8 |
+
if ATTN == 'xformers':
|
| 9 |
+
import xformers.ops as xops
|
| 10 |
+
elif ATTN == 'flash_attn':
|
| 11 |
+
import flash_attn
|
| 12 |
+
else:
|
| 13 |
+
raise ValueError(f"Unknown attention module: {ATTN}")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
'sparse_serialized_scaled_dot_product_self_attention',
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class SerializeMode(Enum):
    """Space-filling-curve orderings used to serialize 3D voxel coordinates."""
    Z_ORDER = 0
    Z_ORDER_TRANSPOSED = 1
    HILBERT = 2
    HILBERT_TRANSPOSED = 3


# All supported serialization modes (e.g. for cycling the mode across layers).
SerializeModes = [
    SerializeMode.Z_ORDER,
    SerializeMode.Z_ORDER_TRANSPOSED,
    SerializeMode.HILBERT,
    SerializeMode.HILBERT_TRANSPOSED
]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def calc_serialization(
    tensor: SparseTensor,
    window_size: int,
    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
    shift_sequence: int = 0,
    shift_window: Tuple[int, int, int] = (0, 0, 0)
) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
    """
    Calculate serialization and partitioning for a set of coordinates.

    Points of each batch element are ordered along a space-filling curve and
    split into windows of exactly ``window_size`` points; windows shorter than
    ``window_size`` are padded by wrapping around the batch's point list.

    Args:
        tensor (SparseTensor): The input tensor.
        window_size (int): The window size to use.
        serialize_mode (SerializeMode): The serialization mode to use.
        shift_sequence (int): The shift of serialized sequence.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.

    Returns:
        (torch.Tensor): Forward indices gathering points into padded windows.
        (torch.Tensor): Backward indices scattering window slots back to points.
        (List[int]): Sequence (window) lengths.
        (List[int]): Batch index of each window.
    """
    fwd_indices = []
    bwd_indices = []
    seq_lens = []
    seq_batch_indices = []
    # Running offset into the concatenated (padded) window sequence.
    offsets = [0]

    # Lazy import: vox2seq is only needed when serialized attention is used.
    # NOTE(review): `import` inside a function binds a local, so the globals()
    # check never becomes true and the import statement re-runs per call
    # (cheap — Python caches modules) — confirm intent.
    if 'vox2seq' not in globals():
        import vox2seq

    # Serialize the input: shift coordinates, then encode along the chosen curve.
    serialize_coords = tensor.coords[:, 1:].clone()
    serialize_coords += torch.tensor(shift_window, dtype=torch.int32, device=tensor.device).reshape(1, 3)
    if serialize_mode == SerializeMode.Z_ORDER:
        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[0, 1, 2])
    elif serialize_mode == SerializeMode.Z_ORDER_TRANSPOSED:
        code = vox2seq.encode(serialize_coords, mode='z_order', permute=[1, 0, 2])
    elif serialize_mode == SerializeMode.HILBERT:
        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[0, 1, 2])
    elif serialize_mode == SerializeMode.HILBERT_TRANSPOSED:
        code = vox2seq.encode(serialize_coords, mode='hilbert', permute=[1, 0, 2])
    else:
        raise ValueError(f"Unknown serialize mode: {serialize_mode}")

    for bi, s in enumerate(tensor.layout):
        num_points = s.stop - s.start
        num_windows = (num_points + window_size - 1) // window_size
        # Fractional per-window point count; used to spread points evenly.
        valid_window_size = num_points / num_windows
        # Permutation sorting this batch's points into serialized order.
        to_ordered = torch.argsort(code[s.start:s.stop])
        if num_windows == 1:
            # Single (possibly short) window: no padding needed.
            fwd_indices.append(to_ordered)
            bwd_indices.append(torch.zeros_like(to_ordered).scatter_(0, to_ordered, torch.arange(num_points, device=tensor.device)))
            fwd_indices[-1] += s.start
            bwd_indices[-1] += offsets[-1]
            seq_lens.append(num_points)
            seq_batch_indices.append(bi)
            offsets.append(offsets[-1] + seq_lens[-1])
        else:
            # Partition the input: each window owns a "valid" span of points and
            # is padded to window_size by wrapping indices modulo num_points.
            offset = 0
            mids = [(i + 0.5) * valid_window_size + shift_sequence for i in range(num_windows)]
            split = [math.floor(i * valid_window_size + shift_sequence) for i in range(num_windows + 1)]
            bwd_index = torch.zeros((num_points,), dtype=torch.int64, device=tensor.device)
            for i in range(num_windows):
                mid = mids[i]
                valid_start = split[i]
                valid_end = split[i + 1]
                # Pad symmetrically around the window midpoint.
                padded_start = math.floor(mid - 0.5 * window_size)
                padded_end = padded_start + window_size
                fwd_indices.append(to_ordered[torch.arange(padded_start, padded_end, device=tensor.device) % num_points])
                # Advance past the left padding, map only the valid slots back,
                # then advance past the right padding (offset grows by
                # window_size per window in total).
                offset += valid_start - padded_start
                bwd_index.scatter_(0, fwd_indices[-1][valid_start-padded_start:valid_end-padded_start], torch.arange(offset, offset + valid_end - valid_start, device=tensor.device))
                offset += padded_end - valid_start
                fwd_indices[-1] += s.start
            seq_lens.extend([window_size] * num_windows)
            seq_batch_indices.extend([bi] * num_windows)
            bwd_indices.append(bwd_index + offsets[-1])
            offsets.append(offsets[-1] + num_windows * window_size)

    fwd_indices = torch.cat(fwd_indices)
    bwd_indices = torch.cat(bwd_indices)

    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def sparse_serialized_scaled_dot_product_self_attention(
    qkv: SparseTensor,
    window_size: int,
    serialize_mode: SerializeMode = SerializeMode.Z_ORDER,
    shift_sequence: int = 0,
    shift_window: Tuple[int, int, int] = (0, 0, 0)
) -> SparseTensor:
    """
    Apply serialized scaled dot product self attention to a sparse tensor.

    Points are ordered along a space-filling curve and attention is computed
    within fixed-size windows of that ordering. The partition is cached on the
    tensor so repeated calls with the same settings reuse it.

    Args:
        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
        window_size (int): The window size to use.
        serialize_mode (SerializeMode): The serialization mode to use.
        shift_sequence (int): The shift of serialized sequence.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.

    Returns:
        SparseTensor: Attention output with the same layout as ``qkv``.
    """
    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"

    # The serialization depends only on coordinates + settings, so it is cached
    # per-tensor under a settings-derived key.
    serialization_spatial_cache_name = f'serialization_{serialize_mode}_{window_size}_{shift_sequence}_{shift_window}'
    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
    if serialization_spatial_cache is None:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_serialization(qkv, window_size, serialize_mode, shift_sequence, shift_window)
        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
    else:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache

    M = fwd_indices.shape[0]   # total padded-window slots
    T = qkv.feats.shape[0]     # total points
    H = qkv.feats.shape[2]     # heads
    C = qkv.feats.shape[3]     # channels per head

    qkv_feats = qkv.feats[fwd_indices]  # [M, 3, H, C]

    if DEBUG:
        # Sanity-check that every window contains points of a single batch element.
        start = 0
        qkv_coords = qkv.coords[fwd_indices]
        for i in range(len(seq_lens)):
            assert (qkv_coords[start:start+seq_lens[i], 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
            start += seq_lens[i]

    if all([seq_len == window_size for seq_len in seq_lens]):
        # Fast path: every window is full, so the sequence reshapes to a dense batch.
        B = len(seq_lens)
        N = window_size
        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=2)   # [B, N, H, C]
            out = xops.memory_efficient_attention(q, k, v)  # [B, N, H, C]
        elif ATTN == 'flash_attn':
            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
        else:
            raise ValueError(f"Unknown attention module: {ATTN}")
        out = out.reshape(B * N, H, C)   # [M, H, C]
    else:
        # Variable-length path: windows are packed and masked block-diagonally.
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=1)   # [M, H, C]
            q = q.unsqueeze(0)  # [1, M, H, C]
            k = k.unsqueeze(0)  # [1, M, H, C]
            v = v.unsqueeze(0)  # [1, M, H, C]
            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
            out = xops.memory_efficient_attention(q, k, v, mask)[0]  # [M, H, C]
        elif ATTN == 'flash_attn':
            # Cumulative sequence lengths, as required by the varlen kernel.
            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
                        .to(qkv.device).int()
            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens))  # [M, H, C]
        # No else needed: the module-level ATTN check raises at import for other values.

    # Scatter padded-window outputs back to the original point order.
    out = out[bwd_indices]  # [T, H, C]

    if DEBUG:
        qkv_coords = qkv_coords[bwd_indices]
        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"

    return qkv.replace(out)
|
anigen/modules/sparse/attention/windowed_attn.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import math
|
| 4 |
+
from .. import SparseTensor
|
| 5 |
+
from .. import DEBUG, ATTN
|
| 6 |
+
|
| 7 |
+
if ATTN == 'xformers':
|
| 8 |
+
import xformers.ops as xops
|
| 9 |
+
elif ATTN == 'flash_attn':
|
| 10 |
+
import flash_attn
|
| 11 |
+
else:
|
| 12 |
+
raise ValueError(f"Unknown attention module: {ATTN}")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
'sparse_windowed_scaled_dot_product_self_attention',
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def calc_window_partition(
    tensor: SparseTensor,
    window_size: Union[int, Tuple[int, ...]],
    shift_window: Union[int, Tuple[int, ...]] = 0
) -> Tuple[torch.Tensor, torch.Tensor, List[int], List[int]]:
    """
    Calculate serialization and partitioning for a set of coordinates.

    Points are grouped by the axis-aligned spatial window their (shifted)
    coordinates fall into; each non-empty window becomes one attention
    sequence.

    Args:
        tensor (SparseTensor): The input tensor.
        window_size (int): The window size to use.
        shift_window (Tuple[int, ...]): The shift of serialized coordinates.

    Returns:
        (torch.Tensor): Forwards indices.
        (torch.Tensor): Backwards indices.
        (List[int]): Sequence lengths.
        (List[int]): Sequence batch indices.
    """
    # Number of spatial dims (coords column 0 is the batch index).
    DIM = tensor.coords.shape[1] - 1
    # Broadcast scalar window/shift to one value per spatial dim.
    shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
    window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
    shifted_coords = tensor.coords.clone().detach()
    shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)

    MAX_COORDS = shifted_coords[:, 1:].max(dim=0).values.tolist()
    NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
    # Row-major strides for linearizing (batch, window_x, window_y, ...) indices;
    # OFFSET[0] is the total number of windows per batch element.
    OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]

    # Convert coordinates to per-dim window indices, then linearize.
    shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
    shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
    # Stable grouping: sort points by window id; bwd inverts the permutation.
    fwd_indices = torch.argsort(shifted_indices)
    bwd_indices = torch.empty_like(fwd_indices)
    bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
    seq_lens = torch.bincount(shifted_indices)
    # Window id // windows-per-batch recovers the batch index of each window.
    seq_batch_indices = torch.arange(seq_lens.shape[0], device=tensor.device, dtype=torch.int32) // OFFSET[0]
    # Drop empty windows.
    mask = seq_lens != 0
    seq_lens = seq_lens[mask].tolist()
    seq_batch_indices = seq_batch_indices[mask].tolist()

    return fwd_indices, bwd_indices, seq_lens, seq_batch_indices
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def sparse_windowed_scaled_dot_product_self_attention(
    qkv: SparseTensor,
    window_size: int,
    shift_window: Tuple[int, int, int] = (0, 0, 0)
) -> SparseTensor:
    """
    Apply windowed scaled dot product self attention to a sparse tensor.

    Attention is restricted to points falling in the same axis-aligned spatial
    window. The window partition is cached on the tensor and reused across
    calls with the same settings.

    Args:
        qkv (SparseTensor): [N, *, 3, H, C] sparse tensor containing Qs, Ks, and Vs.
        window_size (int): The window size to use.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.

    Returns:
        SparseTensor: Attention output with the same layout as ``qkv``.
    """
    assert len(qkv.shape) == 4 and qkv.shape[1] == 3, f"Invalid shape for qkv, got {qkv.shape}, expected [N, *, 3, H, C]"

    # Partition depends only on coordinates + settings; cache it per-tensor.
    serialization_spatial_cache_name = f'window_partition_{window_size}_{shift_window}'
    serialization_spatial_cache = qkv.get_spatial_cache(serialization_spatial_cache_name)
    if serialization_spatial_cache is None:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_window_partition(qkv, window_size, shift_window)
        qkv.register_spatial_cache(serialization_spatial_cache_name, (fwd_indices, bwd_indices, seq_lens, seq_batch_indices))
    else:
        fwd_indices, bwd_indices, seq_lens, seq_batch_indices = serialization_spatial_cache

    M = fwd_indices.shape[0]   # total window slots (== T here; no padding)
    T = qkv.feats.shape[0]     # total points
    H = qkv.feats.shape[2]     # heads
    C = qkv.feats.shape[3]     # channels per head

    qkv_feats = qkv.feats[fwd_indices]  # [M, 3, H, C]

    if DEBUG:
        # Sanity-check each sequence: single batch element, within one window extent.
        start = 0
        qkv_coords = qkv.coords[fwd_indices]
        for i in range(len(seq_lens)):
            seq_coords = qkv_coords[start:start+seq_lens[i]]
            assert (seq_coords[:, 0] == seq_batch_indices[i]).all(), f"SparseWindowedScaledDotProductSelfAttention: batch index mismatch"
            assert (seq_coords[:, 1:].max(dim=0).values - seq_coords[:, 1:].min(dim=0).values < window_size).all(), \
                f"SparseWindowedScaledDotProductSelfAttention: window size exceeded"
            start += seq_lens[i]

    if all([seq_len == window_size for seq_len in seq_lens]):
        # Fast path: all windows hold exactly window_size points -> dense batch.
        B = len(seq_lens)
        N = window_size
        qkv_feats = qkv_feats.reshape(B, N, 3, H, C)
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=2)   # [B, N, H, C]
            out = xops.memory_efficient_attention(q, k, v)  # [B, N, H, C]
        elif ATTN == 'flash_attn':
            out = flash_attn.flash_attn_qkvpacked_func(qkv_feats)   # [B, N, H, C]
        else:
            raise ValueError(f"Unknown attention module: {ATTN}")
        out = out.reshape(B * N, H, C)   # [M, H, C]
    else:
        # Variable-length path: block-diagonal masking over packed sequences.
        if ATTN == 'xformers':
            q, k, v = qkv_feats.unbind(dim=1)   # [M, H, C]
            q = q.unsqueeze(0)  # [1, M, H, C]
            k = k.unsqueeze(0)  # [1, M, H, C]
            v = v.unsqueeze(0)  # [1, M, H, C]
            mask = xops.fmha.BlockDiagonalMask.from_seqlens(seq_lens)
            out = xops.memory_efficient_attention(q, k, v, mask)[0]  # [M, H, C]
        elif ATTN == 'flash_attn':
            # Cumulative sequence lengths, as required by the varlen kernel.
            cu_seqlens = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(seq_lens), dim=0)], dim=0) \
                        .to(qkv.device).int()
            out = flash_attn.flash_attn_varlen_qkvpacked_func(qkv_feats, cu_seqlens, max(seq_lens))  # [M, H, C]
        # No else needed: the module-level ATTN check raises at import for other values.

    # Undo the window-sort permutation.
    out = out[bwd_indices]  # [T, H, C]

    if DEBUG:
        qkv_coords = qkv_coords[bwd_indices]
        assert torch.equal(qkv_coords, qkv.coords), "SparseWindowedScaledDotProductSelfAttention: coordinate mismatch"

    return qkv.replace(out)
|
anigen/modules/sparse/attention/windowed_attn_cross.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import math
|
| 4 |
+
from .. import SparseTensor
|
| 5 |
+
from .. import DEBUG, ATTN
|
| 6 |
+
|
| 7 |
+
if ATTN == 'xformers':
|
| 8 |
+
import xformers.ops as xops
|
| 9 |
+
elif ATTN == 'flash_attn':
|
| 10 |
+
import flash_attn
|
| 11 |
+
else:
|
| 12 |
+
raise ValueError(f"Unknown attention module: {ATTN}")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
'sparse_windowed_scaled_dot_product_cross_attention',
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def calc_window_partition_cross(
    tensor: SparseTensor,
    context: SparseTensor,
    window_size: Union[int, Tuple[int, ...]],
    shift_window: Union[int, Tuple[int, ...]] = 0
) -> Tuple[torch.Tensor, torch.Tensor, List[int], torch.Tensor, torch.Tensor, List[int]]:
    """
    Calculate aligned window partitions for a query tensor and its context.

    Both tensors are partitioned into the same grid of spatial windows; the
    per-window sequence-length lists are padded/masked jointly so that entry i
    of ``seq_lens`` and ``seq_lens_context`` refer to the same window.

    Args:
        tensor (SparseTensor): The query-side input tensor.
        context (SparseTensor): The key/value-side tensor.
        window_size (int): The window size to use.
        shift_window (Tuple[int, ...]): The shift of serialized coordinates.

    Returns:
        (torch.Tensor): Forwards indices for ``tensor``.
        (torch.Tensor): Backwards indices for ``tensor``.
        (List[int]): Sequence lengths for ``tensor``.
        (torch.Tensor): Forwards indices for ``context``.
        (torch.Tensor): Backwards indices for ``context``.
        (List[int]): Sequence lengths for ``context``.
    """

    def calc_window_partition_(tensor, window_size, shift_window):
        # Same linearized-window grouping as calc_window_partition, but empty
        # windows are NOT dropped here — masking happens jointly below.
        DIM = tensor.coords.shape[1] - 1
        shift_window = (shift_window,) * DIM if isinstance(shift_window, int) else shift_window
        window_size = (window_size,) * DIM if isinstance(window_size, int) else window_size
        shifted_coords = tensor.coords.clone().detach()
        shifted_coords[:, 1:] += torch.tensor(shift_window, device=tensor.device, dtype=torch.int32).unsqueeze(0)

        MAX_COORDS = shifted_coords[:, 1:].max(dim=0).values.tolist()
        NUM_WINDOWS = [math.ceil((mc + 1) / ws) for mc, ws in zip(MAX_COORDS, window_size)]
        OFFSET = torch.cumprod(torch.tensor([1] + NUM_WINDOWS[::-1]), dim=0).tolist()[::-1]

        shifted_coords[:, 1:] //= torch.tensor(window_size, device=tensor.device, dtype=torch.int32).unsqueeze(0)
        shifted_indices = (shifted_coords * torch.tensor(OFFSET, device=tensor.device, dtype=torch.int32).unsqueeze(0)).sum(dim=1)
        fwd_indices = torch.argsort(shifted_indices)
        bwd_indices = torch.empty_like(fwd_indices)
        bwd_indices[fwd_indices] = torch.arange(fwd_indices.shape[0], device=tensor.device)
        seq_lens = torch.bincount(shifted_indices)
        seq_batch_indices = torch.arange(seq_lens.shape[0], device=tensor.device, dtype=torch.int32) // OFFSET[0]
        return fwd_indices, bwd_indices, seq_lens, seq_batch_indices

    # NOTE(review): each call derives its strides from its own tensor's
    # MAX_COORDS, so window index i refers to the same spatial cell in both
    # tensors only when their coordinate extents agree — confirm with callers.
    fwd_indices, bwd_indices, seq_lens, seq_batch_indices = calc_window_partition_(tensor, window_size, shift_window)
    fwd_indices_context, bwd_indices_context, seq_lens_context, seq_batch_indices_context = calc_window_partition_(context, window_size, shift_window)
    # Pad the shorter one to the shape of the other with 0 tail
    max_len = max(seq_lens.shape[0], seq_lens_context.shape[0])
    if seq_lens.shape[0] < max_len:
        pad_size = max_len - seq_lens.shape[0]
        seq_lens = torch.cat([seq_lens, torch.zeros(pad_size, dtype=seq_lens.dtype, device=seq_lens.device)])
    if seq_lens_context.shape[0] < max_len:
        pad_size = max_len - seq_lens_context.shape[0]
        seq_lens_context = torch.cat([seq_lens_context, torch.zeros(pad_size, dtype=seq_lens_context.dtype, device=seq_lens_context.device)])
    # Keep a window if either side has points in it, so both lists stay aligned.
    mask = (seq_lens != 0) | (seq_lens_context != 0)
    seq_lens = seq_lens[mask].tolist()
    seq_lens_context = seq_lens_context[mask].tolist()

    return fwd_indices, bwd_indices, seq_lens, fwd_indices_context, bwd_indices_context, seq_lens_context
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def sparse_windowed_scaled_dot_product_cross_attention(
    q: SparseTensor,
    kv: SparseTensor,
    window_size: int,
    shift_window: Tuple[int, int, int] = (0, 0, 0),
    cache_suffix: str = '',
) -> SparseTensor:
    """
    Apply windowed scaled dot product cross attention to a sparse tensor.

    Queries attend only to key/value points falling in the same spatial
    window. Window partitions for ``q`` and ``kv`` are cached per-tensor.

    Args:
        q (SparseTensor): [N, *, 1, H, C] sparse tensor containing queries.
        kv (SparseTensor): [N, *, 2, H, C] sparse tensor containing keys and values.
        window_size (int): The window size to use.
        shift_window (Tuple[int, int, int]): The shift of serialized coordinates.
        cache_suffix (str): Suffix appended to the spatial-cache keys so
            distinct layers sharing the same tensors do not collide.

    Returns:
        SparseTensor: Attention output with the same layout as ``q``.

    Raises:
        ValueError: If the module-level ``ATTN`` backend is unknown.
    """
    assert len(q.shape) == 4 and q.shape[1] == 1, f"Invalid shape for q, got {q.shape}, expected [N, *, 1, H, C]"
    assert len(kv.shape) == 4 and kv.shape[1] == 2, f"Invalid shape for kv, got {kv.shape}, expected [N, *, 2, H, C]"

    serialization_spatial_cache_name_q = f'window_partition_{window_size}_{shift_window}_cross_q' + cache_suffix
    serialization_spatial_cache_q = q.get_spatial_cache(serialization_spatial_cache_name_q)
    serialization_spatial_cache_name_kv = f'window_partition_{window_size}_{shift_window}_cross_kv' + cache_suffix
    serialization_spatial_cache_kv = kv.get_spatial_cache(serialization_spatial_cache_name_kv)
    if serialization_spatial_cache_q is None or serialization_spatial_cache_kv is None:
        # (Re)compute both partitions together: their window lists must stay aligned.
        q_fwd_indices, q_bwd_indices, q_seq_lens, kv_fwd_indices, kv_bwd_indices, kv_seq_lens = calc_window_partition_cross(q, kv, window_size, shift_window)
        q.register_spatial_cache(serialization_spatial_cache_name_q, (q_fwd_indices, q_bwd_indices, q_seq_lens))
        kv.register_spatial_cache(serialization_spatial_cache_name_kv, (kv_fwd_indices, kv_bwd_indices, kv_seq_lens))
    else:
        kv_fwd_indices, kv_bwd_indices, kv_seq_lens = serialization_spatial_cache_kv
        q_fwd_indices, q_bwd_indices, q_seq_lens = serialization_spatial_cache_q

    M_q, T_q, H_q, C_q = q_fwd_indices.shape[0], q.feats.shape[0], q.feats.shape[2], q.feats.shape[3]
    M_kv, T_kv, H_kv, C_kv = kv_fwd_indices.shape[0], kv.feats.shape[0], kv.feats.shape[2], kv.feats.shape[3]
    assert (H_q == H_kv and C_q == C_kv), \
        f"Mismatch in shapes: q ({M_q}, {T_q}, {H_q}, {C_q}), kv ({M_kv}, {T_kv}, {H_kv}, {C_kv})"

    q_feats = q.feats[q_fwd_indices]    # [M, 1, H, C]
    kv_feats = kv.feats[kv_fwd_indices] # [M, 2, H, C]

    if ATTN == 'xformers':
        # BUG FIX: the original `q, k, v = q_feats[:, 0], kv_feats.unbind(dim=1)`
        # unpacked a 2-element RHS into 3 targets (ValueError) and shadowed the
        # `q` parameter, breaking the final `q.replace(out)`. Use fresh locals.
        q_attn = q_feats[:, 0].unsqueeze(0)  # [1, M, H, C]
        k, v = kv_feats.unbind(dim=1)        # each [M, H, C]
        k = k.unsqueeze(0)                   # [1, M, H, C]
        v = v.unsqueeze(0)                   # [1, M, H, C]
        # Query block i attends to key/value block i only.
        mask = xops.fmha.BlockDiagonalMask.from_seqlens(q_seq_lens, kv_seq_lens)
        out = xops.memory_efficient_attention(q_attn, k, v, mask)[0]  # [M, H, C]
    elif ATTN == 'flash_attn':
        # Cumulative sequence lengths, as required by the varlen kernel.
        cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(q_seq_lens), dim=0)], dim=0).to(q.device).int()
        cu_seqlens_k = torch.cat([torch.tensor([0]), torch.cumsum(torch.tensor(kv_seq_lens), dim=0)], dim=0).to(kv.device).int()
        out = flash_attn.flash_attn_varlen_kvpacked_func(q_feats[:, 0], kv_feats, cu_seqlens_q, cu_seqlens_k, max(q_seq_lens), max(kv_seq_lens))  # [M, H, C]
    else:
        # Defensive: the module-level ATTN check already raises at import, but
        # this keeps `out` from ever being referenced while unbound.
        raise ValueError(f"Unknown attention module: {ATTN}")

    # Undo the window-sort permutation on the query side.
    out = out[q_bwd_indices]  # [T, H, C]
    return q.replace(out)
|
| 131 |
+
|
anigen/modules/sparse/basic.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from . import BACKEND, DEBUG
|
| 5 |
+
SparseTensorData = None # Lazy import
|
| 6 |
+
|
| 7 |
+
import importlib
|
| 8 |
+
if BACKEND == 'torchsparse':
|
| 9 |
+
SparseTensorData = importlib.import_module('torchsparse').SparseTensor
|
| 10 |
+
elif BACKEND == 'spconv':
|
| 11 |
+
SparseTensorData = importlib.import_module('spconv.pytorch').SparseConvTensor
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
'SparseTensor',
|
| 16 |
+
'sparse_batch_broadcast',
|
| 17 |
+
'sparse_batch_op',
|
| 18 |
+
'sparse_cat',
|
| 19 |
+
'sparse_unbind',
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class SparseTensor:
    """
    Sparse tensor with support for both torchsparse and spconv backends.

    Parameters:
    - feats (torch.Tensor): Features of the sparse tensor, one row per voxel.
    - coords (torch.Tensor): Integer coordinates; column 0 is the batch index,
      the remaining columns are spatial coordinates.
    - shape (torch.Size): Shape of the sparse tensor.
    - layout (List[slice]): Per-batch row slices into feats/coords.
    - data (SparseTensorData): Backend sparse tensor data used for convolution.

    NOTE:
    - Data corresponding to a same batch should be contiguous.
    - Coords should be in [0, 1023]
    """
    @overload
    def __init__(self, feats: torch.Tensor, coords: torch.Tensor, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...

    @overload
    def __init__(self, data, shape: Optional[torch.Size] = None, layout: Optional[List[slice]] = None, **kwargs): ...

    def __init__(self, *args, **kwargs):
        # Lazy import of sparse tensor backend
        global SparseTensorData
        if SparseTensorData is None:
            import importlib
            if BACKEND == 'torchsparse':
                SparseTensorData = importlib.import_module('torchsparse').SparseTensor
            elif BACKEND == 'spconv':
                SparseTensorData = importlib.import_module('spconv.pytorch').SparseConvTensor

        # Dispatch between the two overloads: 0 = (feats, coords, ...),
        # 1 = (data, ...), decided by the first positional arg or by kwargs.
        method_id = 0
        if len(args) != 0:
            method_id = 0 if isinstance(args[0], torch.Tensor) else 1
        else:
            method_id = 1 if 'data' in kwargs else 0

        if method_id == 0:
            feats, coords, shape, layout = args + (None,) * (4 - len(args))
            # Keyword arguments override positionals and are removed so the
            # remaining kwargs can be forwarded to the backend constructor.
            if 'feats' in kwargs:
                feats = kwargs['feats']
                del kwargs['feats']
            if 'coords' in kwargs:
                coords = kwargs['coords']
                del kwargs['coords']
            if 'shape' in kwargs:
                shape = kwargs['shape']
                del kwargs['shape']
            if 'layout' in kwargs:
                layout = kwargs['layout']
                del kwargs['layout']

            if shape is None:
                shape = self.__cal_shape(feats, coords)
            if layout is None:
                layout = self.__cal_layout(coords, shape[0])
            if BACKEND == 'torchsparse':
                self.data = SparseTensorData(feats, coords, **kwargs)
            elif BACKEND == 'spconv':
                # spconv needs an explicit spatial extent; derive it from coords.
                spatial_shape = list(coords.max(0)[0] + 1)[1:]
                self.data = SparseTensorData(feats.reshape(feats.shape[0], -1), coords, spatial_shape, shape[0], **kwargs)
                # Keep the original (possibly multi-dim) feature view alongside
                # the flattened one spconv was constructed with.
                self.data._features = feats
        elif method_id == 1:
            data, shape, layout = args + (None,) * (3 - len(args))
            if 'data' in kwargs:
                data = kwargs['data']
                del kwargs['data']
            if 'shape' in kwargs:
                shape = kwargs['shape']
                del kwargs['shape']
            if 'layout' in kwargs:
                layout = kwargs['layout']
                del kwargs['layout']

            self.data = data
            if shape is None:
                shape = self.__cal_shape(self.feats, self.coords)
            if layout is None:
                layout = self.__cal_layout(self.coords, shape[0])

        self._shape = shape
        self._layout = layout
        self._scale = kwargs.get('scale', (1, 1, 1))
        self._spatial_cache = kwargs.get('spatial_cache', {})

        if DEBUG:
            # Expensive invariant checks, enabled only in debug builds.
            try:
                assert self.feats.shape[0] == self.coords.shape[0], f"Invalid feats shape: {self.feats.shape}, coords shape: {self.coords.shape}"
                assert self.shape == self.__cal_shape(self.feats, self.coords), f"Invalid shape: {self.shape}"
                assert self.layout == self.__cal_layout(self.coords, self.shape[0]), f"Invalid layout: {self.layout}"
                for i in range(self.shape[0]):
                    assert torch.all(self.coords[self.layout[i], 0] == i), f"The data of batch {i} is not contiguous"
            except Exception as e:
                print('Debugging information:')
                print(f"- Shape: {self.shape}")
                print(f"- Layout: {self.layout}")
                print(f"- Scale: {self._scale}")
                print(f"- Coords: {self.coords}")
                raise e

    def __cal_shape(self, feats, coords):
        # Shape = (batch_size, *feature_dims); batch size from max batch index.
        shape = []
        shape.append(coords[:, 0].max().item() + 1)
        shape.extend([*feats.shape[1:]])
        return torch.Size(shape)

    def __cal_layout(self, coords, batch_size):
        # One contiguous row-slice per batch item (relies on the contiguity
        # invariant stated in the class docstring).
        seq_len = torch.bincount(coords[:, 0], minlength=batch_size)
        offset = torch.cumsum(seq_len, dim=0)
        layout = [slice((offset[i] - seq_len[i]).item(), offset[i].item()) for i in range(batch_size)]
        return layout

    @property
    def shape(self) -> torch.Size:
        return self._shape

    def dim(self) -> int:
        return len(self.shape)

    @property
    def layout(self) -> List[slice]:
        return self._layout

    @property
    def feats(self) -> torch.Tensor:
        # Backend-specific feature storage.
        if BACKEND == 'torchsparse':
            return self.data.F
        elif BACKEND == 'spconv':
            return self.data.features

    @feats.setter
    def feats(self, value: torch.Tensor):
        if BACKEND == 'torchsparse':
            self.data.F = value
        elif BACKEND == 'spconv':
            self.data.features = value

    @property
    def coords(self) -> torch.Tensor:
        # Backend-specific coordinate storage.
        if BACKEND == 'torchsparse':
            return self.data.C
        elif BACKEND == 'spconv':
            return self.data.indices

    @coords.setter
    def coords(self, value: torch.Tensor):
        if BACKEND == 'torchsparse':
            self.data.C = value
        elif BACKEND == 'spconv':
            self.data.indices = value

    @property
    def dtype(self):
        return self.feats.dtype

    @property
    def device(self):
        return self.feats.device

    @overload
    def to(self, dtype: torch.dtype) -> 'SparseTensor': ...

    @overload
    def to(self, device: Optional[Union[str, torch.device]] = None, dtype: Optional[torch.dtype] = None) -> 'SparseTensor': ...

    def to(self, *args, **kwargs) -> 'SparseTensor':
        """Move/convert features (and coords device) like torch.Tensor.to."""
        device = None
        dtype = None
        if len(args) == 2:
            device, dtype = args
        elif len(args) == 1:
            if isinstance(args[0], torch.dtype):
                dtype = args[0]
            else:
                device = args[0]
        if 'dtype' in kwargs:
            assert dtype is None, "to() received multiple values for argument 'dtype'"
            dtype = kwargs['dtype']
        if 'device' in kwargs:
            assert device is None, "to() received multiple values for argument 'device'"
            device = kwargs['device']

        new_feats = self.feats.to(device=device, dtype=dtype)
        new_coords = self.coords.to(device=device)  # coords keep their integer dtype
        return self.replace(new_feats, new_coords)

    def type(self, dtype):
        new_feats = self.feats.type(dtype)
        return self.replace(new_feats)

    def cpu(self) -> 'SparseTensor':
        new_feats = self.feats.cpu()
        new_coords = self.coords.cpu()
        return self.replace(new_feats, new_coords)

    def cuda(self) -> 'SparseTensor':
        new_feats = self.feats.cuda()
        new_coords = self.coords.cuda()
        return self.replace(new_feats, new_coords)

    def half(self) -> 'SparseTensor':
        new_feats = self.feats.half()
        return self.replace(new_feats)

    def float(self) -> 'SparseTensor':
        new_feats = self.feats.float()
        return self.replace(new_feats)

    def detach(self) -> 'SparseTensor':
        new_coords = self.coords.detach()
        new_feats = self.feats.detach()
        return self.replace(new_feats, new_coords)

    def dense(self) -> torch.Tensor:
        # Both backends expose an equivalent dense() conversion.
        if BACKEND == 'torchsparse':
            return self.data.dense()
        elif BACKEND == 'spconv':
            return self.data.dense()

    def reshape(self, *shape) -> 'SparseTensor':
        # Reshape only the per-row feature dims; the row (voxel) dim is fixed.
        new_feats = self.feats.reshape(self.feats.shape[0], *shape)
        return self.replace(new_feats)

    def unbind(self, dim: int) -> List['SparseTensor']:
        return sparse_unbind(self, dim)

    def replace(self, feats: torch.Tensor, coords: Optional[torch.Tensor] = None) -> 'SparseTensor':
        """Return a new SparseTensor sharing this one's structure and caches,
        with new feats (and optionally new coords)."""
        new_shape = [self.shape[0]]
        new_shape.extend(feats.shape[1:])
        if BACKEND == 'torchsparse':
            new_data = SparseTensorData(
                feats=feats,
                coords=self.data.coords if coords is None else coords,
                stride=self.data.stride,
                spatial_range=self.data.spatial_range,
            )
            new_data._caches = self.data._caches
        elif BACKEND == 'spconv':
            new_data = SparseTensorData(
                self.data.features.reshape(self.data.features.shape[0], -1),
                self.data.indices,
                self.data.spatial_shape,
                self.data.batch_size,
                self.data.grid,
                self.data.voxel_num,
                self.data.indice_dict
            )
            new_data._features = feats
            # Carry over spconv bookkeeping so cached conv indices stay valid.
            new_data.benchmark = self.data.benchmark
            new_data.benchmark_record = self.data.benchmark_record
            new_data.thrust_allocator = self.data.thrust_allocator
            new_data._timer = self.data._timer
            new_data.force_algo = self.data.force_algo
            new_data.int8_scale = self.data.int8_scale
            if coords is not None:
                new_data.indices = coords
        new_tensor = SparseTensor(new_data, shape=torch.Size(new_shape), layout=self.layout, scale=self._scale, spatial_cache=self._spatial_cache)
        return new_tensor

    @staticmethod
    def full(aabb, dim, value, dtype=torch.float32, device=None) -> 'SparseTensor':
        """Build a SparseTensor covering every voxel of the inclusive box
        `aabb` = (x0, y0, z0, x1, y1, z1), with `dim` = (batch_size, channels),
        all features filled with `value`."""
        N, C = dim
        x = torch.arange(aabb[0], aabb[3] + 1)
        y = torch.arange(aabb[1], aabb[4] + 1)
        z = torch.arange(aabb[2], aabb[5] + 1)
        coords = torch.stack(torch.meshgrid(x, y, z, indexing='ij'), dim=-1).reshape(-1, 3)
        coords = torch.cat([
            torch.arange(N).view(-1, 1).repeat(1, coords.shape[0]).view(-1, 1),
            coords.repeat(N, 1),
        ], dim=1).to(dtype=torch.int32, device=device)
        feats = torch.full((coords.shape[0], C), value, dtype=dtype, device=device)
        return SparseTensor(feats=feats, coords=coords)

    def __merge_sparse_cache(self, other: 'SparseTensor') -> dict:
        # Union of both operands' spatial caches; on key collision the entries
        # are merged with dict.update (other's sub-keys win).
        new_cache = {}
        for k in set(list(self._spatial_cache.keys()) + list(other._spatial_cache.keys())):
            if k in self._spatial_cache:
                new_cache[k] = self._spatial_cache[k]
            if k in other._spatial_cache:
                if k not in new_cache:
                    new_cache[k] = other._spatial_cache[k]
                else:
                    new_cache[k].update(other._spatial_cache[k])
        return new_cache

    def __neg__(self) -> 'SparseTensor':
        return self.replace(-self.feats)

    def __elemwise__(self, other: Union[torch.Tensor, 'SparseTensor'], op: callable) -> 'SparseTensor':
        # Shared implementation for the arithmetic dunders below.
        if isinstance(other, torch.Tensor):
            try:
                other = torch.broadcast_to(other, self.shape)
                other = sparse_batch_broadcast(self, other)
            except:
                # Best-effort broadcast: on failure, fall through and let the
                # raw elementwise op handle (or reject) the shapes.
                pass
        if isinstance(other, SparseTensor):
            other = other.feats
        new_feats = op(self.feats, other)
        new_tensor = self.replace(new_feats)
        if isinstance(other, SparseTensor):
            # NOTE(review): `other` was rebound to a plain Tensor just above
            # whenever it was a SparseTensor, so this branch appears
            # unreachable and the cache merge never runs — confirm intent.
            new_tensor._spatial_cache = self.__merge_sparse_cache(other)
        return new_tensor

    def __add__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.add)

    def __radd__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.add)

    def __sub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.sub)

    def __rsub__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, lambda x, y: torch.sub(y, x))

    def __mul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.mul)

    def __rmul__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.mul)

    def __truediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, torch.div)

    def __rtruediv__(self, other: Union[torch.Tensor, 'SparseTensor', float]) -> 'SparseTensor':
        return self.__elemwise__(other, lambda x, y: torch.div(y, x))

    def __getitem__(self, idx):
        """Select batch items by int, slice, bool mask, or 1-D index tensor;
        selected items are renumbered from batch index 0."""
        if isinstance(idx, int):
            idx = [idx]
        elif isinstance(idx, slice):
            idx = range(*idx.indices(self.shape[0]))
        elif isinstance(idx, torch.Tensor):
            if idx.dtype == torch.bool:
                assert idx.shape == (self.shape[0],), f"Invalid index shape: {idx.shape}"
                idx = idx.nonzero().squeeze(1)
            elif idx.dtype in [torch.int32, torch.int64]:
                assert len(idx.shape) == 1, f"Invalid index shape: {idx.shape}"
            else:
                raise ValueError(f"Unknown index type: {idx.dtype}")
        else:
            raise ValueError(f"Unknown index type: {type(idx)}")

        coords = []
        feats = []
        for new_idx, old_idx in enumerate(idx):
            coords.append(self.coords[self.layout[old_idx]].clone())
            coords[-1][:, 0] = new_idx  # renumber batch index
            feats.append(self.feats[self.layout[old_idx]])
        coords = torch.cat(coords, dim=0).contiguous()
        feats = torch.cat(feats, dim=0).contiguous()
        return SparseTensor(feats=feats, coords=coords)

    def register_spatial_cache(self, key, value) -> None:
        """
        Register a spatial cache.
        The spatial cache can be anything you want to cache.
        The registry and retrieval of the cache is based on current scale.
        """
        scale_key = str(self._scale)
        if scale_key not in self._spatial_cache:
            self._spatial_cache[scale_key] = {}
        self._spatial_cache[scale_key][key] = value

    def get_spatial_cache(self, key=None):
        """
        Get a spatial cache for the current scale; the whole per-scale dict
        when key is None, else the entry for key (or None).
        """
        scale_key = str(self._scale)
        cur_scale_cache = self._spatial_cache.get(scale_key, {})
        if key is None:
            return cur_scale_cache
        return cur_scale_cache.get(key, None)
| 397 |
+
|
| 398 |
+
def sparse_batch_broadcast(input: SparseTensor, other: torch.Tensor) -> torch.Tensor:
    """
    Broadcast a per-batch tensor across the rows of a sparse tensor.

    Args:
        input (SparseTensor): Sparse tensor whose batch layout defines the broadcast.
        other (torch.Tensor): Tensor with one leading entry per batch item;
            entry k is copied to every sparse row belonging to batch item k.

    Returns:
        torch.Tensor: Dense tensor with the same shape as ``input.feats``.
    """
    # Fixed the docstring, which previously described a nonexistent `op`
    # parameter and mislabeled the arguments; also dropped an unused local
    # that aliased input.coords.
    broadcasted = torch.zeros_like(input.feats)
    for k in range(input.shape[0]):
        broadcasted[input.layout[k]] = other[k]
    return broadcasted
| 413 |
+
|
| 414 |
+
def sparse_batch_op(input: SparseTensor, other: torch.Tensor, op: callable = torch.add) -> SparseTensor:
    """
    Broadcast a per-batch tensor across a sparse tensor's rows, then apply `op`.

    Args:
        input (SparseTensor): Sparse tensor providing the left operand (its feats).
        other (torch.Tensor): One entry per batch item, expanded row-wise via
            sparse_batch_broadcast.
        op (callable): Binary op applied as op(input.feats, broadcasted).
            Defaults to torch.add.

    Returns:
        SparseTensor: New tensor with the op's result as features, same structure.
    """
    return input.replace(op(input.feats, sparse_batch_broadcast(input, other)))
+
|
| 426 |
+
def sparse_cat(inputs: List[SparseTensor], dim: int = 0) -> SparseTensor:
    """
    Concatenate a list of sparse tensors.

    For dim == 0 the batch dimensions are stacked (batch indices of later
    tensors are shifted up); for any other dim the feature tensors are
    concatenated along that dim and the first input's structure is reused.

    Args:
        inputs (List[SparseTensor]): List of sparse tensors to concatenate.
    """
    if dim == 0:
        batch_offset = 0
        shifted_coords = []
        for tensor in inputs:
            cur = tensor.coords.clone()
            cur[:, 0] += batch_offset  # shift batch indices past previous inputs
            shifted_coords.append(cur)
            batch_offset += tensor.shape[0]
        output = SparseTensor(
            coords=torch.cat(shifted_coords, dim=0),
            feats=torch.cat([tensor.feats for tensor in inputs], dim=0),
        )
    else:
        merged_feats = torch.cat([tensor.feats for tensor in inputs], dim=dim)
        output = inputs[0].replace(merged_feats)

    return output
| 452 |
+
|
| 453 |
+
def sparse_unbind(input: SparseTensor, dim: int) -> List[SparseTensor]:
    """
    Unbind a sparse tensor along a dimension.

    Args:
        input (SparseTensor): Sparse tensor to unbind.
        dim (int): Dimension to unbind. 0 splits per batch item; any other
            dim unbinds the feature tensor and reuses the input's structure.
    """
    if dim != 0:
        return [input.replace(part) for part in input.feats.unbind(dim)]
    # dim == 0: one SparseTensor per batch item, via __getitem__.
    return [input[b] for b in range(input.shape[0])]
anigen/modules/sparse/conv/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .. import BACKEND


# Algorithm used by spconv convolutions; overridable via the SPCONV_ALGO env var.
SPCONV_ALGO = 'auto' # 'auto', 'implicit_gemm', 'native'

def __from_env():
    """Read SPCONV_ALGO from the environment, keeping the default on bad values."""
    import os

    global SPCONV_ALGO
    env_spconv_algo = os.environ.get('SPCONV_ALGO')
    if env_spconv_algo is not None and env_spconv_algo in ['auto', 'implicit_gemm', 'native']:
        SPCONV_ALGO = env_spconv_algo
    print(f"[SPARSE][CONV] spconv algo: {SPCONV_ALGO}")


__from_env()

# Re-export the conv implementation matching the package-wide backend choice.
if BACKEND == 'torchsparse':
    from .conv_torchsparse import *
elif BACKEND == 'spconv':
    from .conv_spconv import *
anigen/modules/sparse/conv/conv_spconv.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from .. import SparseTensor
|
| 4 |
+
from .. import DEBUG
|
| 5 |
+
from . import SPCONV_ALGO
|
| 6 |
+
|
| 7 |
+
class SparseConv3d(nn.Module):
    """
    3D sparse convolution (spconv backend).

    Uses a submanifold convolution (SubMConv3d) when stride is 1 and no padding
    is given, so the output sparsity pattern matches the input; otherwise a
    regular spconv.SparseConv3d that may change the spatial structure.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, padding=None, bias=True, indice_key=None):
        super(SparseConv3d, self).__init__()
        # Function-scope import keeps the spconv dependency optional at module load.
        import spconv.pytorch as spconv
        algo = None
        if SPCONV_ALGO == 'native':
            algo = spconv.ConvAlgo.Native
        elif SPCONV_ALGO == 'implicit_gemm':
            algo = spconv.ConvAlgo.MaskImplicitGemm
        if stride == 1 and (padding is None):
            self.conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, dilation=dilation, bias=bias, indice_key=indice_key, algo=algo)
        else:
            self.conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, dilation=dilation, padding=padding, bias=bias, indice_key=indice_key, algo=algo)
        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)
        self.padding = padding

    def forward(self, x: SparseTensor) -> SparseTensor:
        # Fix: the original guarded import in __init__ only bound a
        # function-local name, so referencing `spconv` here raised NameError
        # whenever the re-sort branch below executed. Import at function scope.
        import spconv.pytorch as spconv
        spatial_changed = any(s != 1 for s in self.stride) or (self.padding is not None)
        new_data = self.conv(x.data)
        new_shape = [x.shape[0], self.conv.out_channels]
        new_layout = None if spatial_changed else x.layout

        if spatial_changed and (x.shape[0] != 1):
            # spconv with non-1 stride breaks per-batch contiguity of the output;
            # sort rows by batch index so SparseTensor's layout invariant holds.
            fwd = new_data.indices[:, 0].argsort()
            bwd = torch.zeros_like(fwd).scatter_(0, fwd, torch.arange(fwd.shape[0], device=fwd.device))
            sorted_feats = new_data.features[fwd]
            sorted_coords = new_data.indices[fwd]
            unsorted_data = new_data
            new_data = spconv.SparseConvTensor(sorted_feats, sorted_coords, unsorted_data.spatial_shape, unsorted_data.batch_size)  # type: ignore

        out = SparseTensor(
            new_data, shape=torch.Size(new_shape), layout=new_layout,
            scale=tuple([s * stride for s, stride in zip(x._scale, self.stride)]),
            spatial_cache=x._spatial_cache,
        )

        if spatial_changed and (x.shape[0] != 1):
            # Cache the pre-sort tensor and the inverse permutation so a
            # matching SparseInverseConv3d can restore spconv's original order.
            out.register_spatial_cache(f'conv_{self.stride}_unsorted_data', unsorted_data)
            out.register_spatial_cache(f'conv_{self.stride}_sort_bwd', bwd)

        return out
|
| 51 |
+
|
| 52 |
+
class SparseInverseConv3d(nn.Module):
    """
    Inverse (transposed) sparse 3D convolution (spconv backend).

    Undoes a matching strided SparseConv3d by reusing its indice_key and the
    spatial-cache entries ('conv_{stride}_unsorted_data' / '..._sort_bwd')
    written by SparseConv3d.forward.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseInverseConv3d, self).__init__()
        # NOTE(review): this guard always triggers — the import below binds a
        # function-local name, never a module global. Harmless here because
        # `spconv` is only used inside __init__.
        if 'spconv' not in globals():
            import spconv.pytorch as spconv
        self.conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, bias=bias, indice_key=indice_key)
        self.stride = tuple(stride) if isinstance(stride, (list, tuple)) else (stride, stride, stride)

    def forward(self, x: SparseTensor) -> SparseTensor:
        spatial_changed = any(s != 1 for s in self.stride)
        if spatial_changed:
            # recover the original spconv order
            data = x.get_spatial_cache(f'conv_{self.stride}_unsorted_data')
            bwd = x.get_spatial_cache(f'conv_{self.stride}_sort_bwd')
            data = data.replace_feature(x.feats[bwd])
            if DEBUG:
                assert torch.equal(data.indices, x.coords[bwd]), 'Recover the original order failed'
        else:
            data = x.data

        new_data = self.conv(data)
        new_shape = [x.shape[0], self.conv.out_channels]
        # A strided inverse conv changes the sparsity pattern, so the cached
        # layout no longer applies and is recomputed by SparseTensor.
        new_layout = None if spatial_changed else x.layout
        out = SparseTensor(
            new_data, shape=torch.Size(new_shape), layout=new_layout,
            scale=tuple([s // stride for s, stride in zip(x._scale, self.stride)]),
            spatial_cache=x._spatial_cache,
        )
        return out
anigen/modules/sparse/conv/conv_torchsparse.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from .. import SparseTensor
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SparseConv3d(nn.Module):
    """3D sparse convolution (torchsparse backend)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseConv3d, self).__init__()
        if 'torchsparse' not in globals():
            import torchsparse
        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias)

    def forward(self, x: SparseTensor) -> SparseTensor:
        conv_out = self.conv(x.data)
        # A non-unit stride changes the sparsity pattern, so the cached
        # per-batch layout is discarded and recomputed lazily.
        stride_is_unit = all(s == 1 for s in self.conv.stride)
        result = SparseTensor(
            conv_out,
            shape=torch.Size([x.shape[0], self.conv.out_channels]),
            layout=x.layout if stride_is_unit else None,
        )
        result._spatial_cache = x._spatial_cache
        result._scale = tuple(s * st for s, st in zip(x._scale, self.conv.stride))
        return result
|
| 22 |
+
class SparseInverseConv3d(nn.Module):
    """Transposed (inverse) 3D sparse convolution (torchsparse backend)."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, bias=True, indice_key=None):
        super(SparseInverseConv3d, self).__init__()
        if 'torchsparse' not in globals():
            import torchsparse
        self.conv = torchsparse.nn.Conv3d(in_channels, out_channels, kernel_size, stride, 0, dilation, bias, transposed=True)

    def forward(self, x: SparseTensor) -> SparseTensor:
        conv_out = self.conv(x.data)
        # Upsampling restores a finer grid, so the cached layout is dropped
        # when the stride is non-unit and the scale is divided back down.
        stride_is_unit = all(s == 1 for s in self.conv.stride)
        result = SparseTensor(
            conv_out,
            shape=torch.Size([x.shape[0], self.conv.out_channels]),
            layout=x.layout if stride_is_unit else None,
        )
        result._spatial_cache = x._spatial_cache
        result._scale = tuple(s // st for s, st in zip(x._scale, self.conv.stride))
        return result
|
anigen/modules/sparse/linear.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from . import SparseTensor
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
'SparseLinear'
|
| 7 |
+
]
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SparseLinear(nn.Linear):
    """Linear layer that operates on the features of a SparseTensor."""

    def __init__(self, in_features, out_features, bias=True):
        super(SparseLinear, self).__init__(in_features, out_features, bias)

    def forward(self, input: SparseTensor) -> SparseTensor:
        projected = super().forward(input.feats)
        return input.replace(projected)
anigen/modules/sparse/nonlinearity.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from . import SparseTensor
|
| 4 |
+
|
| 5 |
+
__all__ = [
|
| 6 |
+
'SparseReLU',
|
| 7 |
+
'SparseSiLU',
|
| 8 |
+
'SparseGELU',
|
| 9 |
+
'SparseActivation'
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SparseReLU(nn.ReLU):
    """ReLU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
class SparseSiLU(nn.SiLU):
    """SiLU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
class SparseGELU(nn.GELU):
    """GELU applied element-wise to the features of a SparseTensor."""

    def forward(self, input: SparseTensor) -> SparseTensor:
        activated = super().forward(input.feats)
        return input.replace(activated)
class SparseActivation(nn.Module):
    """Adapter that runs an arbitrary dense activation module on sparse features.

    Useful when the desired nonlinearity has no dedicated Sparse* wrapper.
    """

    def __init__(self, activation: nn.Module):
        super().__init__()
        # The wrapped activation operates on plain tensors.
        self.activation = activation

    def forward(self, input: SparseTensor) -> SparseTensor:
        return input.replace(self.activation(input.feats))
|
| 35 |
+
|
anigen/modules/sparse/norm.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from . import SparseTensor
|
| 4 |
+
from . import DEBUG
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'SparseGroupNorm',
|
| 8 |
+
'SparseLayerNorm',
|
| 9 |
+
'SparseGroupNorm32',
|
| 10 |
+
'SparseLayerNorm32',
|
| 11 |
+
]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SparseGroupNorm(nn.GroupNorm):
    """GroupNorm over a SparseTensor, normalizing each batch item independently.

    Each sample's voxels are gathered via its layout slice, reshaped to the
    dense (1, C, L) layout that nn.GroupNorm expects, normalized, and
    scattered back into place.
    """

    def __init__(self, num_groups, num_channels, eps=1e-5, affine=True):
        super(SparseGroupNorm, self).__init__(num_groups, num_channels, eps, affine)

    def forward(self, input: SparseTensor) -> SparseTensor:
        num_channels = input.shape[1]
        normed = torch.zeros_like(input.feats)
        for b in range(input.shape[0]):
            sel = input.layout[b]
            if DEBUG:
                assert (input.coords[sel, 0] == b).all(), f"SparseGroupNorm: batch index mismatch"
            # (L, C) -> (1, C, L): GroupNorm treats the voxels as spatial positions.
            dense = input.feats[sel].permute(1, 0).reshape(1, num_channels, -1)
            dense = super().forward(dense)
            # Back to (L, C) and scatter into the output buffer.
            normed[sel] = dense.reshape(num_channels, -1).permute(1, 0)
        return input.replace(normed)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class SparseLayerNorm(nn.LayerNorm):
    """LayerNorm applied per batch item over a SparseTensor's features."""

    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super(SparseLayerNorm, self).__init__(normalized_shape, eps, elementwise_affine)

    def forward(self, input: SparseTensor) -> SparseTensor:
        nfeats = torch.zeros_like(input.feats)
        # Normalize each batch element separately using its layout slice.
        for k in range(input.shape[0]):
            bfeats = input.feats[input.layout[k]]
            # (L, C) -> (1, C, L) before invoking the dense LayerNorm.
            # NOTE(review): nn.LayerNorm normalizes the *trailing* dimensions,
            # so with this layout it normalizes over the voxel axis (length L),
            # not the channel axis -- confirm this is intended. The identical
            # layout in SparseGroupNorm is correct for GroupNorm, which expects
            # (N, C, *); it may have been copied here unchanged.
            bfeats = bfeats.permute(1, 0).reshape(1, input.shape[1], -1)
            bfeats = super().forward(bfeats)
            # Back to (L, C) and scatter into the output buffer.
            bfeats = bfeats.reshape(input.shape[1], -1).permute(1, 0)
            nfeats[input.layout[k]] = bfeats
        return input.replace(nfeats)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class SparseGroupNorm32(SparseGroupNorm):
    """
    A GroupNorm layer that converts to float32 before the forward pass.
    """
    def forward(self, x: SparseTensor) -> SparseTensor:
        # Normalize in fp32 for numerical stability, then restore the dtype.
        out = super().forward(x.float())
        return out.type(x.dtype)
|
| 52 |
+
|
| 53 |
+
class SparseLayerNorm32(SparseLayerNorm):
    """
    A LayerNorm layer that converts to float32 before the forward pass.
    """
    def forward(self, x: SparseTensor) -> SparseTensor:
        # Normalize in fp32 for numerical stability, then restore the dtype.
        out = super().forward(x.float())
        return out.type(x.dtype)
|
anigen/modules/sparse/spatial.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from . import SparseTensor
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'SparseDownsample',
|
| 8 |
+
'SparseUpsample',
|
| 9 |
+
'SparseSubdivide'
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SparseDownsample(nn.Module):
    """
    Downsample a sparse tensor by a factor of `factor`.
    Implemented as average pooling.

    Also caches the input coordinates/layout and the child->parent index map
    so a paired SparseUpsample can invert the operation.
    """
    def __init__(self, factor: Union[int, Tuple[int, ...], List[int]]):
        super(SparseDownsample, self).__init__()
        # A scalar factor is broadcast to all spatial dims at forward time.
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor

    def forward(self, input: SparseTensor) -> SparseTensor:
        # Number of spatial dims (first coord column is the batch index).
        DIM = input.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * DIM
        assert DIM == len(factor), 'Input coordinates must have the same dimension as the downsample factor.'

        # Integer-divide each spatial coordinate by its factor; voxels mapping
        # to the same coarse cell will be pooled together.
        coord = list(input.coords.unbind(dim=-1))
        for i, f in enumerate(factor):
            coord[i+1] = coord[i+1] // f

        # Encode (batch, x, y, ...) into a single scalar key: MAX holds the
        # per-dim extents, OFFSET the mixed-radix place values (batch first).
        MAX = [coord[i+1].max().item() + 1 for i in range(DIM)]
        OFFSET = torch.cumprod(torch.tensor(MAX[::-1]), 0).tolist()[::-1] + [1]
        code = sum([c * o for c, o in zip(coord, OFFSET)])
        # Unique coarse cells; idx maps every input voxel to its coarse cell.
        code, idx = code.unique(return_inverse=True)

        # Average-pool the features of all voxels falling into each cell.
        # NOTE(review): scatter_reduce defaults to include_self=True, so the
        # zero base value participates in each cell's mean (denominator is
        # count+1) -- confirm this slight bias is intended.
        new_feats = torch.scatter_reduce(
            torch.zeros(code.shape[0], input.feats.shape[1], device=input.feats.device, dtype=input.feats.dtype),
            dim=0,
            index=idx.unsqueeze(1).expand(-1, input.feats.shape[1]),
            src=input.feats,
            reduce='mean'
        )
        # Decode the scalar keys back into (batch, x, y, ...) coordinates.
        new_coords = torch.stack(
            [code // OFFSET[0]] +
            [(code // OFFSET[i+1]) % MAX[i] for i in range(DIM)],
            dim=-1
        )
        out = SparseTensor(new_feats, new_coords, input.shape,)
        out._scale = tuple([s // f for s, f in zip(input._scale, factor)])
        out._spatial_cache = input._spatial_cache

        # Cache what a paired SparseUpsample needs to invert this pooling.
        out.register_spatial_cache(f'upsample_{factor}_coords', input.coords)
        out.register_spatial_cache(f'upsample_{factor}_layout', input.layout)
        out.register_spatial_cache(f'upsample_{factor}_idx', idx)

        return out
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SparseUpsample(nn.Module):
    """
    Upsample a sparse tensor by a factor of `factor`.
    Implemented as nearest neighbor interpolation.

    Relies on the cache written by a preceding SparseDownsample with the same
    factor; each fine voxel copies the features of its coarse parent.
    """
    def __init__(self, factor: Union[int, Tuple[int, int, int], List[int]]):
        super(SparseUpsample, self).__init__()
        # A scalar factor is broadcast to all spatial dims at forward time.
        self.factor = tuple(factor) if isinstance(factor, (list, tuple)) else factor

    def forward(self, input: SparseTensor) -> SparseTensor:
        DIM = input.coords.shape[-1] - 1
        factor = self.factor if isinstance(self.factor, tuple) else (self.factor,) * DIM
        assert DIM == len(factor), 'Input coordinates must have the same dimension as the upsample factor.'

        # Fetch the structure recorded by the matching SparseDownsample.
        cached_coords = input.get_spatial_cache(f'upsample_{factor}_coords')
        cached_layout = input.get_spatial_cache(f'upsample_{factor}_layout')
        cached_idx = input.get_spatial_cache(f'upsample_{factor}_idx')
        if cached_coords is None or cached_layout is None or cached_idx is None:
            raise ValueError('Upsample cache not found. SparseUpsample must be paired with SparseDownsample.')

        # Nearest-neighbor: every fine voxel takes its parent's features.
        gathered = input.feats[cached_idx]
        out = SparseTensor(gathered, cached_coords, input.shape, cached_layout)
        out._scale = tuple(s * f for s, f in zip(input._scale, factor))
        out._spatial_cache = input._spatial_cache
        return out
|
| 83 |
+
|
| 84 |
+
class SparseSubdivide(nn.Module):
    """
    Subdivide a sparse tensor: each voxel is split into 2**DIM children that
    all inherit the parent's features, doubling the resolution on every axis.

    (The previous docstring, copied from SparseUpsample, described this as
    factor-based nearest-neighbor upsampling, which it is not.)
    """
    def __init__(self):
        super(SparseSubdivide, self).__init__()

    def forward(self, input: SparseTensor) -> SparseTensor:
        # Number of spatial dims (first coord column is the batch index).
        DIM = input.coords.shape[-1] - 1
        # Child offsets: all 0/1 combinations per axis (nonzero of an all-ones
        # 2^DIM cube), with a zero column prepended for the batch coordinate.
        n_cube = torch.ones([2] * DIM, device=input.device, dtype=torch.int)
        n_coords = torch.nonzero(n_cube)
        n_coords = torch.cat([torch.zeros_like(n_coords[:, :1]), n_coords], dim=-1)
        factor = n_coords.shape[0]
        assert factor == 2 ** DIM

        # Scale parent coordinates, then add every child offset.
        new_coords = input.coords.clone()
        new_coords[:, 1:] *= 2
        new_coords = new_coords.unsqueeze(1) + n_coords.unsqueeze(0).to(new_coords.dtype)

        # Replicate each parent's features for all of its children.
        new_feats = input.feats.unsqueeze(1).expand(input.feats.shape[0], factor, *input.feats.shape[1:])
        out = SparseTensor(new_feats.flatten(0, 1), new_coords.flatten(0, 1), input.shape)
        # BUG FIX: `input._scale * 2` on a tuple repeats the tuple (doubling
        # its *length*, e.g. (1,1,1) -> (1,1,1,1,1,1)) instead of doubling each
        # axis scale as SparseDownsample/SparseUpsample do; scale element-wise.
        out._scale = tuple(s * 2 for s in input._scale)
        out._spatial_cache = input._spatial_cache
        return out
|
| 110 |
+
|
anigen/modules/sparse/transformer/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .blocks import *
|
| 2 |
+
from .modulated import *
|
| 3 |
+
from .anigen_modulated import *
|
anigen/modules/sparse/transformer/anigen_modulated.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ..basic import SparseTensor
|
| 5 |
+
from ..attention import SparseMultiHeadAttention, SerializeMode
|
| 6 |
+
from ...norm import LayerNorm32
|
| 7 |
+
from .blocks import SparseFeedForwardNet
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AniGenModulatedSparseTransformerCrossBlock(nn.Module):
    """
    AniGen Sparse Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.

    Runs two coupled streams -- shape features `x` and skeleton features
    `x_skl` -- each with its own norms, attention, MLP and (unless
    `share_mod`) its own adaLN modulation head. The streams exchange
    information via mutual cross-attention, and both additionally
    cross-attend to the external `context`.
    """
    def __init__(
        self,
        channels: int,
        channels_skl: int,
        ctx_channels: int,
        num_heads: int,
        num_heads_skl: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # norm1/norm3 feed the adaLN-modulated branches, so they carry no
        # affine parameters; norm2 (context cross-attn) is affine.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm1_skl = LayerNorm32(channels_skl, elementwise_affine=False, eps=1e-6)
        self.norm2_skl = LayerNorm32(channels_skl, elementwise_affine=True, eps=1e-6)
        self.norm3_skl = LayerNorm32(channels_skl, elementwise_affine=False, eps=1e-6)
        # Shape stream attends to the skeleton stream ...
        self.attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=channels_skl,
            num_heads=num_heads,
            type="cross",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # ... and the skeleton stream attends back to the shape stream.
        self.attn_skl = SparseMultiHeadAttention(
            channels_skl,
            ctx_channels=channels,
            num_heads=num_heads_skl,
            type="cross",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # Both streams cross-attend to the external conditioning context.
        self.context_cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.context_cross_attn_skl = SparseMultiHeadAttention(
            channels_skl,
            ctx_channels=ctx_channels,
            num_heads=num_heads_skl,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        self.mlp_skl = SparseFeedForwardNet(
            channels_skl,
            mlp_ratio=mlp_ratio,
        )
        # With share_mod, the 6-way modulation tensors are computed outside
        # the block and passed in directly as `mod` / `mod_skl`.
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )
            self.adaLN_modulation_skl = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels_skl, 6 * channels_skl, bias=True)
            )

    def _forward(self, x: SparseTensor, x_skl: SparseTensor, mod: torch.Tensor, mod_skl: torch.Tensor, context: torch.Tensor) -> Tuple[SparseTensor, SparseTensor]:
        # Split the conditioning into shift/scale/gate for MSA and MLP, per stream.
        if self.share_mod:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
            shift_msa_skl, scale_msa_skl, gate_msa_skl, shift_mlp_skl, scale_mlp_skl, gate_mlp_skl = mod_skl.chunk(6, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
            shift_msa_skl, scale_msa_skl, gate_msa_skl, shift_mlp_skl, scale_mlp_skl, gate_mlp_skl = self.adaLN_modulation_skl(mod_skl).chunk(6, dim=1)
        # Input Norm
        h = x.replace(self.norm1(x.feats))
        h_skl = x_skl.replace(self.norm1_skl(x_skl.feats))
        # AdaLN (By Time Step)
        h = h * (1 + scale_msa) + shift_msa
        h_skl = h_skl * (1 + scale_msa_skl) + shift_msa_skl
        # Self Attn (Cross shape and skeleton)
        # NOTE(review): the skeleton stream attends to the *already updated*
        # shape output `h` (not the pre-attention features) -- sequential,
        # not symmetric, exchange; confirm this asymmetry is intended.
        h = self.attn(h, h_skl)
        h_skl = self.attn_skl(h_skl, h)
        # Gated Residual (By Time Step)
        h = h * gate_msa
        h_skl = h_skl * gate_msa_skl
        x = x + h
        x_skl = x_skl + h_skl
        # Context Cross Attention
        h = x.replace(self.norm2(x.feats))
        h_skl = x_skl.replace(self.norm2_skl(x_skl.feats))
        h = self.context_cross_attn(h, context)
        h_skl = self.context_cross_attn_skl(h_skl, context)
        x = x + h
        x_skl = x_skl + h_skl
        # Re-Centered
        h = x.replace(self.norm3(x.feats))
        h_skl = x_skl.replace(self.norm3_skl(x_skl.feats))
        h = h * (1 + scale_mlp) + shift_mlp
        h_skl = h_skl * (1 + scale_mlp_skl) + shift_mlp_skl
        # Output MLP
        h = self.mlp(h)
        h_skl = self.mlp_skl(h_skl)
        # Gated Residual (By Time Step)
        h = h * gate_mlp
        h_skl = h_skl * gate_mlp_skl
        x = x + h
        x_skl = x_skl + h_skl
        return x, x_skl

    def forward(self, x: SparseTensor, x_skl: SparseTensor, mod: torch.Tensor, mod_skl: torch.Tensor, context: torch.Tensor) -> Tuple[SparseTensor, SparseTensor]:
        # Optionally trade compute for memory with activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, x_skl, mod, mod_skl, context, use_reentrant=False)
        else:
            return self._forward(x, x_skl, mod, mod_skl, context)
|
anigen/modules/sparse/transformer/blocks.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ..basic import SparseTensor
|
| 5 |
+
from ..linear import SparseLinear
|
| 6 |
+
from ..nonlinearity import SparseGELU
|
| 7 |
+
from ..attention import SparseMultiHeadAttention, SerializeMode
|
| 8 |
+
from ...norm import LayerNorm32
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SparseFeedForwardNet(nn.Module):
    """Transformer FFN for sparse features: expand by `mlp_ratio`, tanh-GELU, project back."""

    def __init__(self, channels: int, mlp_ratio: float = 4.0):
        super().__init__()
        hidden = int(channels * mlp_ratio)
        self.mlp = nn.Sequential(
            SparseLinear(channels, hidden),
            SparseGELU(approximate="tanh"),
            SparseLinear(hidden, channels),
        )

    def forward(self, x: SparseTensor) -> SparseTensor:
        return self.mlp(x)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class SparseTransformerBlock(nn.Module):
    """
    Sparse Transformer block (MSA + FFN).

    Standard pre-norm layout: LayerNorm -> self-attention -> residual,
    then LayerNorm -> feed-forward -> residual.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: SparseTensor) -> SparseTensor:
        # Pre-norm self-attention with residual connection.
        x = x + self.attn(x.replace(self.norm1(x.feats)))
        # Pre-norm feed-forward with residual connection.
        x = x + self.mlp(x.replace(self.norm2(x.feats)))
        return x

    def forward(self, x: SparseTensor) -> SparseTensor:
        # Optionally trade compute for memory with activation checkpointing.
        if not self.use_checkpoint:
            return self._forward(x)
        return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class SparseTransformerCrossBlock(nn.Module):
    """
    Sparse Transformer cross-attention block (MSA + MCA + FFN).

    Pre-norm layout: self-attention, then cross-attention to `context`,
    then feed-forward, each with a residual connection. When
    `context_is_dual` is set, the context is itself a SparseTensor and is
    normalized (norm4) before being used as the cross-attention source.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "serialized", "windowed"] = "full",
        attn_mode_cross: Literal["full", "serialized", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
        context_is_dual: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.context_is_dual = context_is_dual
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        if context_is_dual:
            # NOTE(review): norm4 is sized with `channels` but it is applied to
            # `context.feats`, whose width is `ctx_channels` -- this only works
            # when the two are equal; confirm against callers.
            self.norm4 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode=attn_mode_cross,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: SparseTensor, context: SparseTensor):
        # Self-attention
        h = x.replace(self.norm1(x.feats))
        h = self.self_attn(h)
        x = x + h
        # Cross-attention
        h = x.replace(self.norm2(x.feats))
        if self.context_is_dual:
            # Normalize the sparse context before using it as K/V source.
            context = context.replace(self.norm4(context.feats))
        h = self.cross_attn(h, context)
        x = x + h
        # Feed-forward network
        h = x.replace(self.norm3(x.feats))
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: SparseTensor, context: SparseTensor):
        # Optionally trade compute for memory with activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
        else:
            return self._forward(x, context)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class SparseTransformerMultiContextCrossBlock(nn.Module):
    """
    Sparse Transformer cross-attention block (MSA + MCA + FFN).

    Variant supporting multiple conditioning modalities: one cross-attention
    module (`cross_attn_{i}`) and one context norm (`ctx_norm_{i}`) per entry
    of `ctx_channels`. Each modality's attention output is added to `x` as a
    separate residual.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: List[int],
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "serialized", "windowed"] = "full",
        attn_mode_cross: Literal["full", "serialized", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
        cross_attn_cache_suffix: str = '',
    ):
        super().__init__()
        self.context_num = len(ctx_channels)
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        # norm2 is only needed for the cross-attention path.
        if self.context_num > 0:
            self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)

        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        # One cross-attention + context norm per modality; registered via
        # setattr so they appear as submodules (cross_attn_0, ctx_norm_0, ...).
        for i in range(self.context_num):
            setattr(self, f'cross_attn_{i}', SparseMultiHeadAttention(
                channels,
                ctx_channels=ctx_channels[i],
                num_heads=num_heads,
                type="cross",
                attn_mode=attn_mode_cross,
                window_size=window_size,
                shift_sequence=shift_sequence,
                shift_window=shift_window,
                serialize_mode=serialize_mode,
                qkv_bias=qkv_bias,
                qk_rms_norm=qk_rms_norm_cross,
                cross_attn_cache_suffix=cross_attn_cache_suffix + f'_modality_{i}'
            ))
            setattr(self, f'ctx_norm_{i}', LayerNorm32(ctx_channels[i], elementwise_affine=True, eps=1e-6))

        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )

    def _forward(self, x: SparseTensor, contexts: List[SparseTensor]) -> SparseTensor:
        # Self-attention
        h = x.replace(self.norm1(x.feats))
        h = self.self_attn(h)
        x = x + h
        # Cross-attention
        if self.context_num > 0 and len(contexts) > 0:
            # Queries are normalized once, before the modality loop: every
            # modality attends from the same pre-loop features even though
            # `x` accumulates residuals inside the loop.
            h_norm = x.replace(self.norm2(x.feats))
            for i, context in enumerate(contexts):
                context = context.replace(getattr(self, f'ctx_norm_{i}')(context.feats))
                h = getattr(self, f'cross_attn_{i}')(h_norm, context)
                x = x + h
        # Feed-forward network
        h = x.replace(self.norm3(x.feats))
        h = self.mlp(h)
        x = x + h
        return x

    def forward(self, x: SparseTensor, contexts: List[SparseTensor]) -> SparseTensor:
        # Optionally trade compute for memory with activation checkpointing.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, contexts, use_reentrant=False)
        else:
            return self._forward(x, contexts)
|
anigen/modules/sparse/transformer/modulated.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ..basic import SparseTensor
|
| 5 |
+
from ..attention import SparseMultiHeadAttention, SerializeMode
|
| 6 |
+
from ...norm import LayerNorm32
|
| 7 |
+
from .blocks import SparseFeedForwardNet
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ModulatedSparseTransformerBlock(nn.Module):
    """
    Sparse Transformer block (MSA + FFN) with adaptive layer norm conditioning.

    DiT-style block: the conditioning tensor `mod` yields shift/scale/gate
    triplets for both the attention and MLP branches. With `share_mod`, the
    six modulation tensors are computed outside and passed in pre-split.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # Affine-free norms: scale/shift come from the adaLN modulation.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
        # Obtain the six modulation tensors (precomputed or via the local head).
        if self.share_mod:
            params = mod.chunk(6, dim=1)
        else:
            params = self.adaLN_modulation(mod).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = params
        # Attention branch: modulated pre-norm, then gated residual.
        h = x.replace(self.norm1(x.feats)) * (1 + scale_msa) + shift_msa
        x = x + self.attn(h) * gate_msa
        # MLP branch: modulated pre-norm, then gated residual.
        h = x.replace(self.norm2(x.feats)) * (1 + scale_mlp) + shift_mlp
        x = x + self.mlp(h) * gate_mlp
        return x

    def forward(self, x: SparseTensor, mod: torch.Tensor) -> SparseTensor:
        # Optionally trade compute for memory with activation checkpointing.
        if not self.use_checkpoint:
            return self._forward(x, mod)
        return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class ModulatedSparseTransformerCrossBlock(nn.Module):
    """
    Sparse Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.

    Self-attention and the MLP are modulated (shift/scale/gate from `mod`);
    cross-attention over `context` is neither modulated nor gated.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        shift_sequence: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        serialize_mode: Optional[SerializeMode] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
        norm_for_context: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # norm1/norm3 are affine-free: their scale/shift come from the adaLN modulation.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        # norm2 feeds the unmodulated cross-attention, so it keeps its own affine params.
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm_for_context = norm_for_context
        if self.norm_for_context:
            # Optional normalization of the conditioning tokens before cross-attention.
            self.context_norm = LayerNorm32(ctx_channels, elementwise_affine=True, eps=1e-6)
        self.self_attn = SparseMultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_sequence=shift_sequence,
            shift_window=shift_window,
            serialize_mode=serialize_mode,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.cross_attn = SparseMultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
        )
        self.mlp = SparseFeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            # Per-block adaLN head producing 6 chunks: shift/scale/gate for MSA and MLP.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
        if self.share_mod:
            # `mod` already carries the shared 6-way modulation signal.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
        # Modulated self-attention with gated residual.
        h = x.replace(self.norm1(x.feats))
        h = h * (1 + scale_msa) + shift_msa
        h = self.self_attn(h)
        h = h * gate_msa
        x = x + h
        # Cross-attention over the conditioning context (no modulation, no gate).
        h = x.replace(self.norm2(x.feats))
        if self.norm_for_context:
            if isinstance(context, SparseTensor):
                context = context.replace(self.context_norm(context.feats))
            else:
                context = self.context_norm(context)
        h = self.cross_attn(h, context)
        x = x + h
        # Modulated feed-forward with gated residual.
        h = x.replace(self.norm3(x.feats))
        h = h * (1 + scale_mlp) + shift_mlp
        h = self.mlp(h)
        h = h * gate_mlp
        x = x + h
        return x

    def forward(self, x: SparseTensor, mod: torch.Tensor, context: torch.Tensor) -> SparseTensor:
        # Gradient checkpointing trades recompute for activation memory.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
        else:
            return self._forward(x, mod, context)
|
anigen/modules/spatial.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def pixel_shuffle_3d(x: torch.Tensor, scale_factor: int) -> torch.Tensor:
    """
    3D pixel shuffle: move channel blocks into spatial resolution.

    Args:
        x (torch.Tensor): (B, C, H, W, D) tensor; C must be divisible by
            ``scale_factor ** 3``.
        scale_factor (int): upsampling factor applied to each spatial dim.

    Returns:
        torch.Tensor: (B, C // scale_factor**3, H*s, W*s, D*s) tensor.

    Raises:
        ValueError: if C is not divisible by ``scale_factor ** 3`` (previously
            this floor-divided silently and failed later with an opaque
            reshape error).
    """
    B, C, H, W, D = x.shape
    if C % scale_factor ** 3 != 0:
        raise ValueError(
            f"Channel dim {C} must be divisible by scale_factor**3 = {scale_factor ** 3}"
        )
    C_ = C // scale_factor ** 3
    # Split channels into (C_, s, s, s), then interleave each s with its spatial axis.
    x = x.reshape(B, C_, scale_factor, scale_factor, scale_factor, H, W, D)
    x = x.permute(0, 1, 5, 2, 6, 3, 7, 4)
    x = x.reshape(B, C_, H * scale_factor, W * scale_factor, D * scale_factor)
    return x
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def patchify(x: torch.Tensor, patch_size: int):
    """
    Patchify a tensor.

    Folds each ``patch_size``-sized cube of the spatial dims into the channel
    dim, shrinking every spatial axis by ``patch_size``.

    Args:
        x (torch.Tensor): (N, C, *spatial) tensor
        patch_size (int): Patch size
    """
    ndim_spatial = x.dim() - 2
    for axis in range(2, ndim_spatial + 2):
        assert x.shape[axis] % patch_size == 0, f"Dimension {axis} of input tensor must be divisible by patch size, got {x.shape[axis]} and {patch_size}"

    # Split every spatial axis into (size // p, p).
    split_shape = []
    for axis in range(2, ndim_spatial + 2):
        split_shape.extend([x.shape[axis] // patch_size, patch_size])
    x = x.reshape(*x.shape[:2], *split_shape)
    # Move the per-patch axes next to channels, coarse spatial axes to the end.
    perm = [0, 1] + [2 * i + 3 for i in range(ndim_spatial)] + [2 * i + 2 for i in range(ndim_spatial)]
    x = x.permute(*perm)
    x = x.reshape(x.shape[0], x.shape[1] * patch_size ** ndim_spatial, *x.shape[-ndim_spatial:])
    return x
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def unpatchify(x: torch.Tensor, patch_size: int):
    """
    Unpatchify a tensor (inverse of :func:`patchify`).

    Unfolds ``patch_size ** ndim`` channel groups back into the spatial dims,
    growing every spatial axis by ``patch_size``.

    Args:
        x (torch.Tensor): (N, C, *spatial) tensor
        patch_size (int): Patch size
    """
    ndim_spatial = x.dim() - 2
    assert x.shape[1] % (patch_size ** ndim_spatial) == 0, f"Second dimension of input tensor must be divisible by patch size to unpatchify, got {x.shape[1]} and {patch_size ** ndim_spatial}"

    # Split channels into (C', p, p, ...) ahead of the coarse spatial axes.
    x = x.reshape(x.shape[0], x.shape[1] // patch_size ** ndim_spatial, *([patch_size] * ndim_spatial), *x.shape[-ndim_spatial:])
    # Interleave each coarse spatial axis with its patch axis.
    perm = [0, 1]
    for i in range(ndim_spatial):
        perm.extend([2 + ndim_spatial + i, 2 + i])
    x = x.permute(*perm)
    out_spatial = [x.shape[2 + 2 * i] * patch_size for i in range(ndim_spatial)]
    x = x.reshape(x.shape[0], x.shape[1], *out_spatial)
    return x
|
anigen/modules/transformer/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .blocks import *
|
| 2 |
+
from .modulated import *
|
anigen/modules/transformer/blocks.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ..attention import MultiHeadAttention
|
| 5 |
+
from ..norm import LayerNorm32
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class AbsolutePositionEmbedder(nn.Module):
    """
    Embeds spatial positions into vector representations.

    Uses fixed sinusoidal frequencies per input axis; output is zero-padded
    to ``channels`` when the sin/cos features do not fill it exactly.
    """
    def __init__(self, channels: int, in_channels: int = 3):
        super().__init__()
        self.channels = channels
        self.in_channels = in_channels
        # Per-axis budget, split in half between sin and cos features.
        self.freq_dim = channels // in_channels // 2
        exponents = torch.arange(self.freq_dim, dtype=torch.float32) / self.freq_dim
        self.freqs = 1.0 / (10000 ** exponents)

    def _sin_cos_embedding(self, x: torch.Tensor) -> torch.Tensor:
        """
        Create sinusoidal position embeddings.

        Args:
            x: a 1-D Tensor of N indices

        Returns:
            an (N, D) Tensor of positional embeddings.
        """
        # freqs is a plain tensor (not a buffer), so move it lazily to x's device.
        self.freqs = self.freqs.to(x.device)
        angles = torch.outer(x, self.freqs)
        return torch.cat([angles.sin(), angles.cos()], dim=-1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): (N, D) tensor of spatial positions
        """
        N, D = x.shape
        assert D == self.in_channels, "Input dimension must match number of input channels"
        embed = self._sin_cos_embedding(x.reshape(-1)).reshape(N, -1)
        pad = self.channels - embed.shape[1]
        if pad > 0:
            embed = torch.cat([embed, torch.zeros(N, pad, device=embed.device)], dim=-1)
        return embed
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class FeedForwardNet(nn.Module):
    """Two-layer MLP with tanh-approximated GELU, used as a transformer FFN."""

    def __init__(self, channels: int, mlp_ratio: float = 4.0, out_channels: Optional[int] = None):
        super().__init__()
        hidden = int(channels * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(channels, hidden),
            nn.GELU(approximate="tanh"),
            nn.Linear(hidden, channels if out_channels is None else out_channels),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.mlp(x)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class TransformerBlock(nn.Module):
    """
    Transformer block (MSA + FFN).

    When ``out_channels`` differs from ``channels`` the residual of the FFN
    stage is projected through an extra 1x MLP so the widths match.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        out_channels: Optional[int] = None,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[int] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.channels = channels
        self.out_channels = channels if out_channels is None else out_channels
        self.mlp = FeedForwardNet(self.channels, out_channels=self.out_channels, mlp_ratio=mlp_ratio)
        if self.out_channels != self.channels:
            # Width change: residual path needs its own projection.
            self.res_mlp = FeedForwardNet(self.channels, out_channels=self.out_channels, mlp_ratio=1.0)

    def _forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.norm1(x))
        h = self.mlp(self.norm2(x))
        if self.out_channels != self.channels:
            x = self.res_mlp(x)
        return x + h

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.use_checkpoint:
            return self._forward(x)
        return torch.utils.checkpoint.checkpoint(self._forward, x, use_reentrant=False)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class SkinTransformerCrossBlock(nn.Module):
    """
    Dual-stream block mixing a token stream with joint features via a skinning matrix.

    In ``_forward``, `x` receives value-projected `j` features aggregated by the
    `skin` weight matrix (``h = skin @ v``) followed by an FFN, while `j` runs a
    self-attention + FFN update in parallel. Both updated streams are returned.
    (Presumably `x` holds per-point tokens and `j` per-joint tokens — confirm
    against callers.)
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        out_channels: Optional[int] = None,
        mlp_ratio: float = 4.0,
        use_checkpoint: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
    ):
        super().__init__()
        # NOTE(review): use_checkpoint is stored but this class defines no
        # `forward` checkpoint wrapper (unlike the sibling blocks); callers
        # appear to use `_forward` directly — confirm.
        self.use_checkpoint = use_checkpoint
        self.norm1 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        # Value projection applied to joint tokens before skin-weighted gathering.
        self.to_v = nn.Linear(channels, channels, bias=qkv_bias)
        self.channels = channels
        self.out_channels = out_channels if out_channels is not None else channels
        self.mlp = FeedForwardNet(
            self.channels,
            out_channels=self.out_channels,
            mlp_ratio=mlp_ratio,
        )
        self.joint_attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode="full",
            qkv_bias=qkv_bias,
        )
        self.joint_mlp = FeedForwardNet(
            self.channels,
            out_channels=self.out_channels,
            mlp_ratio=mlp_ratio,
        )
        if self.out_channels != self.channels:
            # Width change: both residual paths need their own projections.
            self.res_mlp = FeedForwardNet(
                self.channels,
                out_channels=self.out_channels,
                mlp_ratio=1.0,
            )
            self.res_joint_mlp = FeedForwardNet(
                self.channels,
                out_channels=self.out_channels,
                mlp_ratio=1.0,
            )

    def _forward(self, x: torch.Tensor, j: torch.Tensor, skin: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # Gather value-projected joint features into the x stream, weighted by skin.
        v = self.to_v(self.norm1(j))
        h = skin @ v
        x = x + h
        h = self.norm2(x)
        h = self.mlp(h)
        if self.out_channels != self.channels:
            x = self.res_mlp(x)
        x = x + h

        # Self-attention + FFN update for the joint stream. Note the FFN is
        # applied to the post-attention residual sum (no norm before joint_mlp),
        # and that sum is then added to `j` again — intentional per the original.
        h_j = self.norm3(j)
        h_j = self.joint_attn(h_j)
        h_j = j + h_j
        h_j = self.joint_mlp(h_j)
        if self.out_channels != self.channels:
            j = self.res_joint_mlp(j)
        j = j + h_j
        return x, j
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class TransformerCrossBlock(nn.Module):
    """
    Transformer cross-attention block (MSA + MCA + FFN).

    With ``no_self=True`` the self-attention stage is disabled (identity norm,
    zero contribution); with a width change the FFN residual is projected.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        out_channels: Optional[int] = None,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        ln_affine: bool = False,
        x_is_query: bool = False,
        no_self: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.norm1 = nn.Identity() if no_self else LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=ln_affine, eps=1e-6)
        if no_self:
            # Contributes 0 to the residual, effectively skipping self-attention.
            self.self_attn = lambda x: 0
        else:
            self.self_attn = MultiHeadAttention(
                channels,
                num_heads=num_heads,
                type="self",
                attn_mode=attn_mode,
                window_size=window_size,
                shift_window=shift_window,
                qkv_bias=qkv_bias,
                use_rope=use_rope,
                qk_rms_norm=qk_rms_norm,
            )
        self.cross_attn = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
            x_is_query=x_is_query,
        )
        self.channels = channels
        self.out_channels = channels if out_channels is None else out_channels
        self.mlp = FeedForwardNet(channels, out_channels=self.out_channels, mlp_ratio=mlp_ratio)
        if self.out_channels != self.channels:
            # Width change: the FFN residual needs its own projection.
            self.res_mlp = FeedForwardNet(self.channels, out_channels=self.out_channels, mlp_ratio=1.0)

    def _forward(self, x: torch.Tensor, context: torch.Tensor):
        x = x + self.self_attn(self.norm1(x))
        x = x + self.cross_attn(self.norm2(x), context)
        h = self.mlp(self.norm3(x))
        if self.out_channels != self.channels:
            x = self.res_mlp(x)
        return x + h

    def forward(self, x: torch.Tensor, context: torch.Tensor):
        if not self.use_checkpoint:
            return self._forward(x, context)
        return torch.utils.checkpoint.checkpoint(self._forward, x, context, use_reentrant=False)
|
| 285 |
+
|
anigen/modules/transformer/modulated.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import *
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from ..attention import MultiHeadAttention
|
| 5 |
+
from ..norm import LayerNorm32
|
| 6 |
+
from .blocks import FeedForwardNet
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ModulatedTransformerBlock(nn.Module):
    """
    Transformer block (MSA + FFN) with adaptive layer norm conditioning.

    The modulation vector yields six chunks (shift/scale/gate for each of the
    two branches); with ``share_mod=True`` the caller supplies the projected
    signal, otherwise a per-block adaLN head computes it.
    """
    def __init__(
        self,
        channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # Affine-free norms: scale/shift are supplied by the modulation signal.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.norm2 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
        )
        self.mlp = FeedForwardNet(channels, mlp_ratio=mlp_ratio)
        if not share_mod:
            self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 6 * channels, bias=True))

    def _forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
        signal = mod if self.share_mod else self.adaLN_modulation(mod)
        # Broadcast each chunk over the sequence dimension up front.
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            chunk.unsqueeze(1) for chunk in signal.chunk(6, dim=1)
        )
        x = x + self.attn(self.norm1(x) * (1 + scale_msa) + shift_msa) * gate_msa
        x = x + self.mlp(self.norm2(x) * (1 + scale_mlp) + shift_mlp) * gate_mlp
        return x

    def forward(self, x: torch.Tensor, mod: torch.Tensor) -> torch.Tensor:
        if not self.use_checkpoint:
            return self._forward(x, mod)
        return torch.utils.checkpoint.checkpoint(self._forward, x, mod, use_reentrant=False)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class ModulatedTransformerCrossBlock(nn.Module):
    """
    Transformer cross-attention block (MSA + MCA + FFN) with adaptive layer norm conditioning.

    Self-attention and the MLP are modulated/gated by `mod`; cross-attention
    over `context` is neither. Optional LoRA adapters can be enabled on each
    attention projection.
    """
    def __init__(
        self,
        channels: int,
        ctx_channels: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "windowed"] = "full",
        window_size: Optional[int] = None,
        shift_window: Optional[Tuple[int, int, int]] = None,
        use_checkpoint: bool = False,
        use_rope: bool = False,
        qk_rms_norm: bool = False,
        qk_rms_norm_cross: bool = False,
        qkv_bias: bool = True,
        share_mod: bool = False,
        # LoRA options, forwarded to MultiHeadAttention.
        use_lora_self: bool = False,
        lora_rank_self: int = 4,
        use_lora_cross: bool = False,
        lora_rank_cross: int = 4,
        lora_lr_rate: float = 1.0,
        use_context_norm: bool = False,
    ):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        self.share_mod = share_mod
        # norm1/norm3 are affine-free: scale/shift come from the adaLN modulation.
        self.norm1 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        # norm2 feeds the unmodulated cross-attention, so it keeps its own affine.
        self.norm2 = LayerNorm32(channels, elementwise_affine=True, eps=1e-6)
        self.norm3 = LayerNorm32(channels, elementwise_affine=False, eps=1e-6)
        self.use_context_norm = use_context_norm
        if self.use_context_norm:
            # Optional normalization of the conditioning tokens before cross-attention.
            self.context_norm = LayerNorm32(ctx_channels, elementwise_affine=True, eps=1e-6)
        self.self_attn = MultiHeadAttention(
            channels,
            num_heads=num_heads,
            type="self",
            attn_mode=attn_mode,
            window_size=window_size,
            shift_window=shift_window,
            qkv_bias=qkv_bias,
            use_rope=use_rope,
            qk_rms_norm=qk_rms_norm,
            use_lora=use_lora_self,
            lora_rank=lora_rank_self,
            lora_lr_rate=lora_lr_rate,
        )
        self.cross_attn = MultiHeadAttention(
            channels,
            ctx_channels=ctx_channels,
            num_heads=num_heads,
            type="cross",
            attn_mode="full",
            qkv_bias=qkv_bias,
            qk_rms_norm=qk_rms_norm_cross,
            use_lora=use_lora_cross,
            lora_rank=lora_rank_cross,
            lora_lr_rate=lora_lr_rate,
        )
        self.mlp = FeedForwardNet(
            channels,
            mlp_ratio=mlp_ratio,
        )
        if not share_mod:
            # Per-block adaLN head producing 6 chunks: shift/scale/gate for MSA and MLP.
            self.adaLN_modulation = nn.Sequential(
                nn.SiLU(),
                nn.Linear(channels, 6 * channels, bias=True)
            )

    def _forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
        if self.share_mod:
            # `mod` already carries the shared 6-way modulation signal.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(6, dim=1)
        else:
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(mod).chunk(6, dim=1)
        # Modulated self-attention with gated residual.
        h = self.norm1(x)
        h = h * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)
        h = self.self_attn(h)
        h = h * gate_msa.unsqueeze(1)
        x = x + h
        # Cross-attention over the conditioning context (no modulation, no gate).
        h = self.norm2(x)
        if self.use_context_norm:
            context = self.context_norm(context)
        h = self.cross_attn(h, context)
        x = x + h
        # Modulated feed-forward with gated residual.
        h = self.norm3(x)
        h = h * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        h = self.mlp(h)
        h = h * gate_mlp.unsqueeze(1)
        x = x + h
        return x

    def forward(self, x: torch.Tensor, mod: torch.Tensor, context: torch.Tensor):
        # Gradient checkpointing trades recompute for activation memory.
        if self.use_checkpoint:
            return torch.utils.checkpoint.checkpoint(self._forward, x, mod, context, use_reentrant=False)
        else:
            return self._forward(x, mod, context)
|
| 175 |
+
|
anigen/modules/utils.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from ..modules import sparse as sp
|
| 3 |
+
|
| 4 |
+
# Leaf module types whose parameters are switched between fp16 and fp32 by
# convert_module_to_f16 / convert_module_to_f32 below.
FP16_MODULES = (
    nn.Conv1d,
    nn.Conv2d,
    nn.Conv3d,
    nn.ConvTranspose1d,
    nn.ConvTranspose2d,
    nn.ConvTranspose3d,
    nn.Linear,
    sp.SparseConv3d,
    sp.SparseInverseConv3d,
    sp.SparseLinear,
)
|
| 16 |
+
|
| 17 |
+
def convert_module_to_f16(l):
    """
    Convert primitive modules to float16.

    Only touches module types listed in FP16_MODULES; everything else is left as-is.
    """
    if not isinstance(l, FP16_MODULES):
        return
    for param in l.parameters():
        param.data = param.data.half()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def convert_module_to_f32(l):
    """
    Convert primitive modules to float32, undoing convert_module_to_f16().

    Only touches module types listed in FP16_MODULES; everything else is left as-is.
    """
    if not isinstance(l, FP16_MODULES):
        return
    for param in l.parameters():
        param.data = param.data.float()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for param in module.parameters():
        # zeros_ runs under no_grad, so autograd never sees the write.
        nn.init.zeros_(param)
    return module
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def scale_module(module, scale):
    """
    Scale the parameters of a module in place and return it.
    """
    for param in module.parameters():
        # Detach so the in-place multiply is invisible to autograd.
        param.detach().mul_(scale)
    return module
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def modulate(x, shift, scale):
    """
    Apply adaLN-style modulation: per-sample scale and shift broadcast over dim 1.
    """
    scale_b = scale.unsqueeze(1)
    shift_b = shift.unsqueeze(1)
    return x * (scale_b + 1) + shift_b
|