JokerZhou committed
Commit 8bf6610 · 1 Parent(s): 9fd6c72

Upload files

Files changed (44)
  1. .DS_Store +0 -0
  2. LICENSE +201 -0
  3. app.py +6 -0
  4. download_model_from_hf.py +20 -0
  5. flash_head/.DS_Store +0 -0
  6. flash_head/audio_analysis/torch_utils.py +20 -0
  7. flash_head/audio_analysis/wav2vec2.py +125 -0
  8. flash_head/configs/infer_params.yaml +10 -0
  9. flash_head/inference.py +77 -0
  10. flash_head/ltx_video/.DS_Store +0 -0
  11. flash_head/ltx_video/__init__.py +0 -0
  12. flash_head/ltx_video/ltx_vae.py +42 -0
  13. flash_head/ltx_video/models/__init__.py +0 -0
  14. flash_head/ltx_video/models/autoencoders/__init__.py +0 -0
  15. flash_head/ltx_video/models/autoencoders/causal_conv3d.py +63 -0
  16. flash_head/ltx_video/models/autoencoders/causal_video_autoencoder.py +1412 -0
  17. flash_head/ltx_video/models/autoencoders/conv_nd_factory.py +90 -0
  18. flash_head/ltx_video/models/autoencoders/dual_conv3d.py +217 -0
  19. flash_head/ltx_video/models/autoencoders/pixel_norm.py +12 -0
  20. flash_head/ltx_video/models/autoencoders/vae.py +380 -0
  21. flash_head/ltx_video/models/autoencoders/vae_encode.py +256 -0
  22. flash_head/ltx_video/models/autoencoders/video_autoencoder.py +1045 -0
  23. flash_head/ltx_video/models/transformers/__init__.py +0 -0
  24. flash_head/ltx_video/models/transformers/attention.py +1265 -0
  25. flash_head/ltx_video/models/transformers/embeddings.py +129 -0
  26. flash_head/ltx_video/models/transformers/symmetric_patchifier.py +84 -0
  27. flash_head/ltx_video/models/transformers/transformer3d.py +507 -0
  28. flash_head/ltx_video/utils/__init__.py +0 -0
  29. flash_head/ltx_video/utils/diffusers_config_mapping.py +174 -0
  30. flash_head/ltx_video/utils/prompt_enhance_utils.py +226 -0
  31. flash_head/ltx_video/utils/skip_layer_strategy.py +8 -0
  32. flash_head/ltx_video/utils/torch_utils.py +25 -0
  33. flash_head/src/.DS_Store +0 -0
  34. flash_head/src/distributed/usp_device.py +35 -0
  35. flash_head/src/modules/flash_head_model.py +548 -0
  36. flash_head/src/pipeline/flash_head_pipeline.py +316 -0
  37. flash_head/utils/cpu_face_handler.py +55 -0
  38. flash_head/utils/facecrop.py +110 -0
  39. flash_head/utils/utils.py +222 -0
  40. flash_head/wan/modules/__init__.py +5 -0
  41. flash_head/wan/modules/vae.py +1598 -0
  42. generate_video.py +218 -0
  43. gradio_app_streaming.py +339 -0
  44. requirements.txt +23 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py ADDED
@@ -0,0 +1,6 @@
+ from gradio_app_streaming import app
+ from download_model_from_hf import download_model
+
+ if __name__ == "__main__":
+     download_model("Soul-AILab/SoulX-FlashHead-1_3B", "models")
+     app.launch(share=True)
download_model_from_hf.py ADDED
@@ -0,0 +1,20 @@
+ from pathlib import Path
+ from huggingface_hub import snapshot_download
+
+ def download_model(model_name, save_dir="models"):
+     # Create the local save directory
+     save_path = Path(save_dir) / model_name.split("/")[-1]
+
+     if save_path.exists():
+         print(f"✅ Model already exists: {save_path}")
+         return str(save_path)
+
+     save_path.mkdir(parents=True, exist_ok=True)
+
+     download_path = snapshot_download(
+         repo_id=model_name,
+         local_dir=save_path,
+         local_dir_use_symlinks=False
+     )
+     print(f"✅ Download complete: {download_path}")
+     return download_path
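Note (editor sketch, not part of the commit): the helper above is normally called once from app.py at startup, but it can also be exercised on its own; the only requirements are huggingface_hub and network access, and the repo id below is the same one app.py passes.

from download_model_from_hf import download_model

if __name__ == "__main__":
    # Same repo id that app.py uses at startup; files land under models/SoulX-FlashHead-1_3B.
    local_dir = download_model("Soul-AILab/SoulX-FlashHead-1_3B", save_dir="models")
    print(f"Model available at: {local_dir}")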
flash_head/.DS_Store ADDED
Binary file (6.15 kB).
 
flash_head/audio_analysis/torch_utils.py ADDED
@@ -0,0 +1,20 @@
+ import torch
+ import torch.nn.functional as F
+
+
+ def get_mask_from_lengths(lengths, max_len=None):
+     lengths = lengths.to(torch.long)
+     if max_len is None:
+         max_len = torch.max(lengths).item()
+
+     ids = torch.arange(0, max_len).unsqueeze(0).expand(lengths.shape[0], -1).to(lengths.device)
+     mask = ids < lengths.unsqueeze(1).expand(-1, max_len)
+
+     return mask
+
+
+ def linear_interpolation(features, seq_len):
+     features = features.transpose(1, 2)
+     output_features = F.interpolate(features, size=seq_len, align_corners=True, mode='linear')
+     return output_features.transpose(1, 2)
+
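Editor note (illustrative sketch, not part of the commit): the two helpers above are easy to sanity-check in isolation. get_mask_from_lengths builds a boolean padding mask from per-sample lengths, and linear_interpolation resamples a (batch, time, channels) feature sequence to a target frame count; the shapes below are made up for the example.

import torch
from flash_head.audio_analysis.torch_utils import get_mask_from_lengths, linear_interpolation

# Boolean mask for a padded batch: True where a position lies inside the sequence.
lengths = torch.tensor([3, 5])
mask = get_mask_from_lengths(lengths)          # shape (2, 5)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])

# Resample 50 audio-feature steps to 25 video frames; channels are untouched.
features = torch.randn(1, 50, 768)             # (batch, time, channels)
resampled = linear_interpolation(features, seq_len=25)
print(mask.shape, resampled.shape)             # torch.Size([2, 5]) torch.Size([1, 25, 768])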
flash_head/audio_analysis/wav2vec2.py ADDED
@@ -0,0 +1,125 @@
+ from transformers import Wav2Vec2Config, Wav2Vec2Model
+ from transformers.modeling_outputs import BaseModelOutput
+
+ from .torch_utils import linear_interpolation
+
+ # the implementation of Wav2Vec2Model is borrowed from
+ # https://github.com/huggingface/transformers/blob/HEAD/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+ # initialize our encoder with the pre-trained wav2vec 2.0 weights.
+ class Wav2Vec2Model(Wav2Vec2Model):
+     def __init__(self, config: Wav2Vec2Config):
+         super().__init__(config)
+
+     def forward(
+         self,
+         input_values,
+         seq_len,
+         attention_mask=None,
+         mask_time_indices=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         self.config.output_attentions = False
+
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         extract_features = self.feature_extractor(input_values)
+         extract_features = extract_features.transpose(1, 2)
+         extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+         if attention_mask is not None:
+             # compute reduced attention_mask corresponding to feature vectors
+             attention_mask = self._get_feature_vector_attention_mask(
+                 extract_features.shape[1], attention_mask, add_adapter=False
+             )
+
+         hidden_states, extract_features = self.feature_projection(extract_features)
+         hidden_states = self._mask_hidden_states(
+             hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+         )
+
+         encoder_outputs = self.encoder(
+             hidden_states,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = encoder_outputs[0]
+
+         if self.adapter is not None:
+             hidden_states = self.adapter(hidden_states)
+
+         if not return_dict:
+             return (hidden_states, ) + encoder_outputs[1:]
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
+
+
+     def feature_extract(
+         self,
+         input_values,
+         seq_len,
+     ):
+         extract_features = self.feature_extractor(input_values)
+         extract_features = extract_features.transpose(1, 2)
+         extract_features = linear_interpolation(extract_features, seq_len=seq_len)
+
+         return extract_features
+
+     def encode(
+         self,
+         extract_features,
+         attention_mask=None,
+         mask_time_indices=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         self.config.output_attentions = False
+
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if attention_mask is not None:
+             # compute reduced attention_mask corresponding to feature vectors
+             attention_mask = self._get_feature_vector_attention_mask(
+                 extract_features.shape[1], attention_mask, add_adapter=False
+             )
+
+
+         hidden_states, extract_features = self.feature_projection(extract_features)
+         hidden_states = self._mask_hidden_states(
+             hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+         )
+
+         encoder_outputs = self.encoder(
+             hidden_states,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = encoder_outputs[0]
+
+         if self.adapter is not None:
+             hidden_states = self.adapter(hidden_states)
+
+         if not return_dict:
+             return (hidden_states, ) + encoder_outputs[1:]
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
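Editor note (illustrative sketch, not part of the commit): the subclass above changes the usual wav2vec 2.0 interface in one way — forward takes an extra seq_len and linearly resamples the conv features to that many steps, so the audio features line up with video frames. feature_extract/encode split the same computation into two stages. The checkpoint id below ("facebook/wav2vec2-base-960h") is only a public placeholder; in this repo the weights would come from the local wav2vec_dir passed to the pipeline.

import torch
from flash_head.audio_analysis.wav2vec2 import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

waveform = torch.randn(1, 16000)   # 1 s of 16 kHz audio
num_frames = 25                    # align features to 25 video frames (tgt_fps)

with torch.no_grad():
    # one-shot path
    out = model(waveform, seq_len=num_frames)
    # two-stage path: extract conv features once, encode them later
    feats = model.feature_extract(waveform, seq_len=num_frames)
    enc = model.encode(feats)

print(out.last_hidden_state.shape, enc.last_hidden_state.shape)  # both (1, 25, hidden_size)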
flash_head/configs/infer_params.yaml ADDED
@@ -0,0 +1,10 @@
+ frame_num: 33
+ motion_frames_latent_num: 2
+ tgt_fps: 25
+ sample_rate: 16000
+ sample_shift: 5
+ color_correction_strength: 1.0
+ cached_audio_duration: 8
+ num_heads: 12
+ height: 512
+ width: 512
flash_head/inference.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+ import yaml
+ import torch
+ import copy
+ from loguru import logger
+
+ from flash_head.src.pipeline.flash_head_pipeline import FlashHeadPipeline
+ from flash_head.src.distributed.usp_device import get_device, get_parallel_degree
+
+ with open("flash_head/configs/infer_params.yaml", "r") as f:
+     infer_params = yaml.safe_load(f)
+
+ def get_pipeline(world_size, ckpt_dir, model_type, wav2vec_dir):
+     global infer_params
+     ulysses_degree, ring_degree = get_parallel_degree(world_size, infer_params['num_heads'])
+     device = get_device(ulysses_degree, ring_degree)
+     logger.info(f"ulysses_degree: {ulysses_degree}, ring_degree: {ring_degree}, device: {device}")
+
+     pipeline = FlashHeadPipeline(
+         checkpoint_dir=ckpt_dir,
+         model_type=model_type,
+         wav2vec_dir=wav2vec_dir,
+         device=device,
+         use_usp=(world_size > 1),
+     )
+
+     # compute motion_frames_num
+     motion_frames_latent_num = infer_params['motion_frames_latent_num']
+     motion_frames_num = (motion_frames_latent_num - 1) * pipeline.config.vae_stride[0] + 1
+     infer_params['motion_frames_num'] = motion_frames_num
+
+     # TODO: move to args
+     if model_type == "pretrained":
+         infer_params['sample_steps'] = 20
+     else:
+         infer_params['sample_steps'] = 4
+     return pipeline
+
+ def get_base_data(pipeline, cond_image_path_or_dir, base_seed, use_face_crop):
+     pipeline.prepare_params(
+         cond_image_path_or_dir=cond_image_path_or_dir,
+         target_size=(infer_params['height'], infer_params['width']),
+         frame_num=infer_params['frame_num'],
+         motion_frames_num=infer_params['motion_frames_num'],
+         sampling_steps=infer_params['sample_steps'],
+         seed=base_seed,
+         shift=infer_params['sample_shift'],
+         color_correction_strength=infer_params['color_correction_strength'],
+         use_face_crop=use_face_crop,
+     )
+
+ def get_infer_params():
+     global infer_params
+     return copy.deepcopy(infer_params)
+
+ def get_audio_embedding(pipeline, audio_array, audio_start_idx=-1, audio_end_idx=-1):
+     # audio_array = loudness_norm(audio_array, infer_params['sample_rate'])
+     audio_embedding = pipeline.preprocess_audio(audio_array, sr=infer_params['sample_rate'], fps=infer_params['tgt_fps'])
+
+     if audio_start_idx == -1 or audio_end_idx == -1:
+         audio_start_idx = 0
+         audio_end_idx = audio_embedding.shape[0]
+
+     indices = (torch.arange(2 * 2 + 1) - 2) * 1
+
+     center_indices = torch.arange(audio_start_idx, audio_end_idx, 1).unsqueeze(1) + indices.unsqueeze(0)
+     center_indices = torch.clamp(center_indices, min=0, max=audio_end_idx-1)
+
+     audio_embedding = audio_embedding[center_indices][None,...].contiguous()
+     return audio_embedding
+
+ def run_pipeline(pipeline, audio_embedding):
+     audio_embedding = audio_embedding.to(pipeline.device)
+     sample = pipeline.generate(audio_embedding)
+     sample_frames = (((sample+1)/2).permute(1,2,3,0).clip(0,1) * 255).contiguous()
+     return sample_frames
+
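Editor note (illustrative sketch, not part of the commit): the four helpers above compose into a single-GPU run roughly as follows. The checkpoint directories, the reference image path, and the silent audio array are placeholders, model_type="pretrained" is the only value the code above names explicitly, and the script must be launched from the repo root because the module loads flash_head/configs/infer_params.yaml at import time.

import numpy as np
from flash_head.inference import (
    get_pipeline, get_base_data, get_audio_embedding, run_pipeline,
)

pipeline = get_pipeline(
    world_size=1,
    ckpt_dir="models/SoulX-FlashHead-1_3B",   # placeholder path
    model_type="pretrained",
    wav2vec_dir="models/wav2vec2-base",       # placeholder path
)
get_base_data(pipeline, cond_image_path_or_dir="assets/ref_face.png",  # placeholder image
              base_seed=42, use_face_crop=True)

# A real 16 kHz speech clip would be passed here; 4 s of zeros is only a stand-in.
audio_array = np.zeros(4 * 16000, dtype=np.float32)
audio_embedding = get_audio_embedding(pipeline, audio_array)
frames = run_pipeline(pipeline, audio_embedding)  # (F, H, W, 3), values scaled to 0-255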
flash_head/ltx_video/.DS_Store ADDED
Binary file (6.15 kB).
 
flash_head/ltx_video/__init__.py ADDED
File without changes
flash_head/ltx_video/ltx_vae.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ from flash_head.ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
+
+
+ class LtxVAE:
+     def __init__(
+         self,
+         pretrained_model_type_or_path,
+         dtype = torch.bfloat16,
+         device = "cuda",
+     ):
+         self.model = CausalVideoAutoencoder.from_pretrained(pretrained_model_type_or_path)
+         self.model = self.model.eval().requires_grad_(False).to(device).to(dtype)
+
+     # torch.Size([1, 3, 33, 512, 512]) -> torch.Size([128, 5, 16, 16])
+     def encode(self, video):
+         latents = self.model.encode(video, return_dict=False)[0].sample()
+         out = self.normalize_latents(latents)
+         return out[0]
+
+     # torch.Size([128, 5, 16, 16]) -> torch.Size([1, 3, 33, 512, 512])
+     def decode(self, zs):
+         latents = zs.unsqueeze(0)
+         image = self.model.decode(
+             self.un_normalize_latents(latents),
+             return_dict=False,
+             target_shape=latents.shape,
+         )[0]
+         return image
+
+     def normalize_latents(self, latents):
+         return (
+             (latents - self.model.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1))
+             / self.model.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
+         )
+
+
+     def un_normalize_latents(self, latents):
+         return (
+             latents * self.model.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
+             + self.model.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
+         )
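Editor note (illustrative sketch, not part of the commit): a round trip through the wrapper above, using the shapes its own comments document — 33 RGB frames at 512x512 map to a (128, 5, 16, 16) latent, i.e. 8x temporal and 32x spatial compression with 128 channels. The checkpoint path is a placeholder and a CUDA device is assumed.

import torch
from flash_head.ltx_video.ltx_vae import LtxVAE

vae = LtxVAE("models/ltx_vae", dtype=torch.bfloat16, device="cuda")  # placeholder path

video = torch.randn(1, 3, 33, 512, 512, dtype=torch.bfloat16, device="cuda")
latents = vae.encode(video)    # (128, 5, 16, 16), per-channel normalized
recon = vae.decode(latents)    # back to (1, 3, 33, 512, 512)
print(latents.shape, recon.shape)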
flash_head/ltx_video/models/__init__.py ADDED
File without changes
flash_head/ltx_video/models/autoencoders/__init__.py ADDED
File without changes
flash_head/ltx_video/models/autoencoders/causal_conv3d.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Tuple, Union
+
+ import torch
+ import torch.nn as nn
+
+
+ class CausalConv3d(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         kernel_size: int = 3,
+         stride: Union[int, Tuple[int]] = 1,
+         dilation: int = 1,
+         groups: int = 1,
+         spatial_padding_mode: str = "zeros",
+         **kwargs,
+     ):
+         super().__init__()
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+
+         kernel_size = (kernel_size, kernel_size, kernel_size)
+         self.time_kernel_size = kernel_size[0]
+
+         dilation = (dilation, 1, 1)
+
+         height_pad = kernel_size[1] // 2
+         width_pad = kernel_size[2] // 2
+         padding = (0, height_pad, width_pad)
+
+         self.conv = nn.Conv3d(
+             in_channels,
+             out_channels,
+             kernel_size,
+             stride=stride,
+             dilation=dilation,
+             padding=padding,
+             padding_mode=spatial_padding_mode,
+             groups=groups,
+         )
+
+     def forward(self, x, causal: bool = True):
+         if causal:
+             first_frame_pad = x[:, :, :1, :, :].repeat(
+                 (1, 1, self.time_kernel_size - 1, 1, 1)
+             )
+             x = torch.concatenate((first_frame_pad, x), dim=2)
+         else:
+             first_frame_pad = x[:, :, :1, :, :].repeat(
+                 (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+             )
+             last_frame_pad = x[:, :, -1:, :, :].repeat(
+                 (1, 1, (self.time_kernel_size - 1) // 2, 1, 1)
+             )
+             x = torch.concatenate((first_frame_pad, x, last_frame_pad), dim=2)
+         x = self.conv(x)
+         return x
+
+     @property
+     def weight(self):
+         return self.conv.weight
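Editor note (illustrative sketch, not part of the commit): the causal flag only changes how the temporal axis is padded. With causal=True the first frame is replicated time_kernel_size-1 times in front, so no output frame sees future input; with causal=False the padding is split between the first and last frame. Either way the temporal length is preserved. The channel counts and tensor sizes below are made up for the example.

import torch
from flash_head.ltx_video.models.autoencoders.causal_conv3d import CausalConv3d

conv = CausalConv3d(in_channels=4, out_channels=8, kernel_size=3)

x = torch.randn(1, 4, 9, 32, 32)        # (B, C, T, H, W)
y_causal = conv(x, causal=True)          # prepends 2 copies of frame 0 before the conv
y_sym = conv(x, causal=False)            # pads 1 copy of the first and last frame
print(y_causal.shape, y_sym.shape)       # both torch.Size([1, 8, 9, 32, 32])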
flash_head/ltx_video/models/autoencoders/causal_video_autoencoder.py ADDED
@@ -0,0 +1,1412 @@
1
+ import json
2
+ import os
3
+ from functools import partial
4
+ from types import SimpleNamespace
5
+ from typing import Any, Mapping, Optional, Tuple, Union, List
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ import numpy as np
10
+ from einops import rearrange
11
+ from torch import nn
12
+ from diffusers.utils import logging
13
+ import torch.nn.functional as F
14
+ from diffusers.models.embeddings import PixArtAlphaCombinedTimestepSizeEmbeddings
15
+ from safetensors import safe_open
16
+
17
+
18
+ from flash_head.ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
19
+ from flash_head.ltx_video.models.autoencoders.pixel_norm import PixelNorm
20
+ from flash_head.ltx_video.models.autoencoders.vae import AutoencoderKLWrapper
21
+ from flash_head.ltx_video.models.transformers.attention import Attention
22
+ from flash_head.ltx_video.utils.diffusers_config_mapping import (
23
+ diffusers_and_ours_config_mapping,
24
+ make_hashable_key,
25
+ VAE_KEYS_RENAME_DICT,
26
+ )
27
+
28
+ PER_CHANNEL_STATISTICS_PREFIX = "per_channel_statistics."
29
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
30
+
31
+
32
+ class CausalVideoAutoencoder(AutoencoderKLWrapper):
33
+ @classmethod
34
+ def from_pretrained(
35
+ cls,
36
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
37
+ *args,
38
+ **kwargs,
39
+ ):
40
+ pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
41
+ if (
42
+ pretrained_model_name_or_path.is_dir()
43
+ and (pretrained_model_name_or_path / "autoencoder.pth").exists()
44
+ ):
45
+ config_local_path = pretrained_model_name_or_path / "config.json"
46
+ config = cls.load_config(config_local_path, **kwargs)
47
+
48
+ model_local_path = pretrained_model_name_or_path / "autoencoder.pth"
49
+ state_dict = torch.load(model_local_path, map_location=torch.device("cpu"))
50
+
51
+ statistics_local_path = (
52
+ pretrained_model_name_or_path / "per_channel_statistics.json"
53
+ )
54
+ if statistics_local_path.exists():
55
+ with open(statistics_local_path, "r") as file:
56
+ data = json.load(file)
57
+ transposed_data = list(zip(*data["data"]))
58
+ data_dict = {
59
+ col: torch.tensor(vals)
60
+ for col, vals in zip(data["columns"], transposed_data)
61
+ }
62
+ std_of_means = data_dict["std-of-means"]
63
+ mean_of_means = data_dict.get(
64
+ "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
65
+ )
66
+ state_dict[f"{PER_CHANNEL_STATISTICS_PREFIX}std-of-means"] = (
67
+ std_of_means
68
+ )
69
+ state_dict[f"{PER_CHANNEL_STATISTICS_PREFIX}mean-of-means"] = (
70
+ mean_of_means
71
+ )
72
+
73
+ elif pretrained_model_name_or_path.is_dir():
74
+ config_path = pretrained_model_name_or_path / "config.json"
75
+ with open(config_path, "r") as f:
76
+ config = make_hashable_key(json.load(f))
77
+
78
+ assert config in diffusers_and_ours_config_mapping, (
79
+ "Provided diffusers checkpoint config for VAE is not supported. "
80
+ "We only support diffusers configs found in Lightricks/LTX-Video."
81
+ )
82
+
83
+ config = diffusers_and_ours_config_mapping[config]
84
+
85
+ state_dict_path = (
86
+ pretrained_model_name_or_path
87
+ / "diffusion_pytorch_model.safetensors"
88
+ )
89
+
90
+ state_dict = {}
91
+ with safe_open(state_dict_path, framework="pt", device="cpu") as f:
92
+ for k in f.keys():
93
+ state_dict[k] = f.get_tensor(k)
94
+ for key in list(state_dict.keys()):
95
+ new_key = key
96
+ for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
97
+ new_key = new_key.replace(replace_key, rename_key)
98
+
99
+ state_dict[new_key] = state_dict.pop(key)
100
+
101
+ elif pretrained_model_name_or_path.is_file() and str(
102
+ pretrained_model_name_or_path
103
+ ).endswith(".safetensors"):
104
+ state_dict = {}
105
+ with safe_open(
106
+ pretrained_model_name_or_path, framework="pt", device="cpu"
107
+ ) as f:
108
+ metadata = f.metadata()
109
+ for k in f.keys():
110
+ state_dict[k] = f.get_tensor(k)
111
+ configs = json.loads(metadata["config"])
112
+ config = configs["vae"]
113
+
114
+ video_vae = cls.from_config(config)
115
+ if "torch_dtype" in kwargs:
116
+ video_vae.to(kwargs["torch_dtype"])
117
+ video_vae.load_state_dict(state_dict)
118
+ return video_vae
119
+
120
+ @staticmethod
121
+ def from_config(config):
122
+ assert (
123
+ config["_class_name"] == "CausalVideoAutoencoder"
124
+ ), "config must have _class_name=CausalVideoAutoencoder"
125
+ if isinstance(config["dims"], list):
126
+ config["dims"] = tuple(config["dims"])
127
+
128
+ assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
129
+
130
+ double_z = config.get("double_z", True)
131
+ latent_log_var = config.get(
132
+ "latent_log_var", "per_channel" if double_z else "none"
133
+ )
134
+ use_quant_conv = config.get("use_quant_conv", True)
135
+ normalize_latent_channels = config.get("normalize_latent_channels", False)
136
+
137
+ if use_quant_conv and latent_log_var in ["uniform", "constant"]:
138
+ raise ValueError(
139
+ f"latent_log_var={latent_log_var} requires use_quant_conv=False"
140
+ )
141
+
142
+ encoder = Encoder(
143
+ dims=config["dims"],
144
+ in_channels=config.get("in_channels", 3),
145
+ out_channels=config["latent_channels"],
146
+ blocks=config.get("encoder_blocks", config.get("blocks")),
147
+ patch_size=config.get("patch_size", 1),
148
+ latent_log_var=latent_log_var,
149
+ norm_layer=config.get("norm_layer", "group_norm"),
150
+ base_channels=config.get("encoder_base_channels", 128),
151
+ spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
152
+ )
153
+
154
+ decoder = Decoder(
155
+ dims=config["dims"],
156
+ in_channels=config["latent_channels"],
157
+ out_channels=config.get("out_channels", 3),
158
+ blocks=config.get("decoder_blocks", config.get("blocks")),
159
+ patch_size=config.get("patch_size", 1),
160
+ norm_layer=config.get("norm_layer", "group_norm"),
161
+ causal=config.get("causal_decoder", False),
162
+ timestep_conditioning=config.get("timestep_conditioning", False),
163
+ base_channels=config.get("decoder_base_channels", 128),
164
+ spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
165
+ )
166
+
167
+ dims = config["dims"]
168
+ return CausalVideoAutoencoder(
169
+ encoder=encoder,
170
+ decoder=decoder,
171
+ latent_channels=config["latent_channels"],
172
+ dims=dims,
173
+ use_quant_conv=use_quant_conv,
174
+ normalize_latent_channels=normalize_latent_channels,
175
+ )
176
+
177
+ @property
178
+ def config(self):
179
+ return SimpleNamespace(
180
+ _class_name="CausalVideoAutoencoder",
181
+ dims=self.dims,
182
+ in_channels=self.encoder.conv_in.in_channels // self.encoder.patch_size**2,
183
+ out_channels=self.decoder.conv_out.out_channels
184
+ // self.decoder.patch_size**2,
185
+ latent_channels=self.decoder.conv_in.in_channels,
186
+ encoder_blocks=self.encoder.blocks_desc,
187
+ decoder_blocks=self.decoder.blocks_desc,
188
+ scaling_factor=1.0,
189
+ norm_layer=self.encoder.norm_layer,
190
+ patch_size=self.encoder.patch_size,
191
+ latent_log_var=self.encoder.latent_log_var,
192
+ use_quant_conv=self.use_quant_conv,
193
+ causal_decoder=self.decoder.causal,
194
+ timestep_conditioning=self.decoder.timestep_conditioning,
195
+ normalize_latent_channels=self.normalize_latent_channels,
196
+ )
197
+
198
+ @property
199
+ def is_video_supported(self):
200
+ """
201
+ Check if the model supports video inputs of shape (B, C, F, H, W). Otherwise, the model only supports 2D images.
202
+ """
203
+ return self.dims != 2
204
+
205
+ @property
206
+ def spatial_downscale_factor(self):
207
+ return (
208
+ 2
209
+ ** len(
210
+ [
211
+ block
212
+ for block in self.encoder.blocks_desc
213
+ if block[0]
214
+ in [
215
+ "compress_space",
216
+ "compress_all",
217
+ "compress_all_res",
218
+ "compress_space_res",
219
+ ]
220
+ ]
221
+ )
222
+ * self.encoder.patch_size
223
+ )
224
+
225
+ @property
226
+ def temporal_downscale_factor(self):
227
+ return 2 ** len(
228
+ [
229
+ block
230
+ for block in self.encoder.blocks_desc
231
+ if block[0]
232
+ in [
233
+ "compress_time",
234
+ "compress_all",
235
+ "compress_all_res",
236
+ "compress_space_res",
237
+ ]
238
+ ]
239
+ )
240
+
241
+ def to_json_string(self) -> str:
242
+ import json
243
+
244
+ return json.dumps(self.config.__dict__)
245
+
246
+ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
247
+ if any([key.startswith("vae.") for key in state_dict.keys()]):
248
+ state_dict = {
249
+ key.replace("vae.", ""): value
250
+ for key, value in state_dict.items()
251
+ if key.startswith("vae.")
252
+ }
253
+ ckpt_state_dict = {
254
+ key: value
255
+ for key, value in state_dict.items()
256
+ if not key.startswith(PER_CHANNEL_STATISTICS_PREFIX)
257
+ }
258
+
259
+ model_keys = set(name for name, _ in self.named_modules())
260
+
261
+ key_mapping = {
262
+ ".resnets.": ".res_blocks.",
263
+ "downsamplers.0": "downsample",
264
+ "upsamplers.0": "upsample",
265
+ }
266
+ converted_state_dict = {}
267
+ for key, value in ckpt_state_dict.items():
268
+ for k, v in key_mapping.items():
269
+ key = key.replace(k, v)
270
+
271
+ key_prefix = ".".join(key.split(".")[:-1])
272
+ if "norm" in key and key_prefix not in model_keys:
273
+ logger.info(
274
+ f"Removing key {key} from state_dict as it is not present in the model"
275
+ )
276
+ continue
277
+
278
+ converted_state_dict[key] = value
279
+
280
+ super().load_state_dict(converted_state_dict, strict=strict)
281
+
282
+ data_dict = {
283
+ key.removeprefix(PER_CHANNEL_STATISTICS_PREFIX): value
284
+ for key, value in state_dict.items()
285
+ if key.startswith(PER_CHANNEL_STATISTICS_PREFIX)
286
+ }
287
+ if len(data_dict) > 0:
288
+ self.register_buffer("std_of_means", data_dict["std-of-means"])
289
+ self.register_buffer(
290
+ "mean_of_means",
291
+ data_dict.get(
292
+ "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
293
+ ),
294
+ )
295
+
296
+ def last_layer(self):
297
+ if hasattr(self.decoder, "conv_out"):
298
+ if isinstance(self.decoder.conv_out, nn.Sequential):
299
+ last_layer = self.decoder.conv_out[-1]
300
+ else:
301
+ last_layer = self.decoder.conv_out
302
+ else:
303
+ last_layer = self.decoder.layers[-1]
304
+ return last_layer
305
+
306
+ def set_use_tpu_flash_attention(self):
307
+ for block in self.decoder.up_blocks:
308
+ if isinstance(block, UNetMidBlock3D) and block.attention_blocks:
309
+ for attention_block in block.attention_blocks:
310
+ attention_block.set_use_tpu_flash_attention()
311
+
312
+
313
+ class Encoder(nn.Module):
314
+ r"""
315
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
316
+
317
+ Args:
318
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
319
+ The number of dimensions to use in convolutions.
320
+ in_channels (`int`, *optional*, defaults to 3):
321
+ The number of input channels.
322
+ out_channels (`int`, *optional*, defaults to 3):
323
+ The number of output channels.
324
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
325
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
326
+ base_channels (`int`, *optional*, defaults to 128):
327
+ The number of output channels for the first convolutional layer.
328
+ norm_num_groups (`int`, *optional*, defaults to 32):
329
+ The number of groups for normalization.
330
+ patch_size (`int`, *optional*, defaults to 1):
331
+ The patch size to use. Should be a power of 2.
332
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
333
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
334
+ latent_log_var (`str`, *optional*, defaults to `per_channel`):
335
+ The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
336
+ """
337
+
338
+ def __init__(
339
+ self,
340
+ dims: Union[int, Tuple[int, int]] = 3,
341
+ in_channels: int = 3,
342
+ out_channels: int = 3,
343
+ blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
344
+ base_channels: int = 128,
345
+ norm_num_groups: int = 32,
346
+ patch_size: Union[int, Tuple[int]] = 1,
347
+ norm_layer: str = "group_norm", # group_norm, pixel_norm
348
+ latent_log_var: str = "per_channel",
349
+ spatial_padding_mode: str = "zeros",
350
+ ):
351
+ super().__init__()
352
+ self.patch_size = patch_size
353
+ self.norm_layer = norm_layer
354
+ self.latent_channels = out_channels
355
+ self.latent_log_var = latent_log_var
356
+ self.blocks_desc = blocks
357
+
358
+ in_channels = in_channels * patch_size**2
359
+ output_channel = base_channels
360
+
361
+ self.conv_in = make_conv_nd(
362
+ dims=dims,
363
+ in_channels=in_channels,
364
+ out_channels=output_channel,
365
+ kernel_size=3,
366
+ stride=1,
367
+ padding=1,
368
+ causal=True,
369
+ spatial_padding_mode=spatial_padding_mode,
370
+ )
371
+
372
+ self.down_blocks = nn.ModuleList([])
373
+
374
+ for block_name, block_params in blocks:
375
+ input_channel = output_channel
376
+ if isinstance(block_params, int):
377
+ block_params = {"num_layers": block_params}
378
+
379
+ if block_name == "res_x":
380
+ block = UNetMidBlock3D(
381
+ dims=dims,
382
+ in_channels=input_channel,
383
+ num_layers=block_params["num_layers"],
384
+ resnet_eps=1e-6,
385
+ resnet_groups=norm_num_groups,
386
+ norm_layer=norm_layer,
387
+ spatial_padding_mode=spatial_padding_mode,
388
+ )
389
+ elif block_name == "res_x_y":
390
+ output_channel = block_params.get("multiplier", 2) * output_channel
391
+ block = ResnetBlock3D(
392
+ dims=dims,
393
+ in_channels=input_channel,
394
+ out_channels=output_channel,
395
+ eps=1e-6,
396
+ groups=norm_num_groups,
397
+ norm_layer=norm_layer,
398
+ spatial_padding_mode=spatial_padding_mode,
399
+ )
400
+ elif block_name == "compress_time":
401
+ block = make_conv_nd(
402
+ dims=dims,
403
+ in_channels=input_channel,
404
+ out_channels=output_channel,
405
+ kernel_size=3,
406
+ stride=(2, 1, 1),
407
+ causal=True,
408
+ spatial_padding_mode=spatial_padding_mode,
409
+ )
410
+ elif block_name == "compress_space":
411
+ block = make_conv_nd(
412
+ dims=dims,
413
+ in_channels=input_channel,
414
+ out_channels=output_channel,
415
+ kernel_size=3,
416
+ stride=(1, 2, 2),
417
+ causal=True,
418
+ spatial_padding_mode=spatial_padding_mode,
419
+ )
420
+ elif block_name == "compress_all":
421
+ block = make_conv_nd(
422
+ dims=dims,
423
+ in_channels=input_channel,
424
+ out_channels=output_channel,
425
+ kernel_size=3,
426
+ stride=(2, 2, 2),
427
+ causal=True,
428
+ spatial_padding_mode=spatial_padding_mode,
429
+ )
430
+ elif block_name == "compress_all_x_y":
431
+ output_channel = block_params.get("multiplier", 2) * output_channel
432
+ block = make_conv_nd(
433
+ dims=dims,
434
+ in_channels=input_channel,
435
+ out_channels=output_channel,
436
+ kernel_size=3,
437
+ stride=(2, 2, 2),
438
+ causal=True,
439
+ spatial_padding_mode=spatial_padding_mode,
440
+ )
441
+ elif block_name == "compress_all_res":
442
+ output_channel = block_params.get("multiplier", 2) * output_channel
443
+ block = SpaceToDepthDownsample(
444
+ dims=dims,
445
+ in_channels=input_channel,
446
+ out_channels=output_channel,
447
+ stride=(2, 2, 2),
448
+ spatial_padding_mode=spatial_padding_mode,
449
+ )
450
+ elif block_name == "compress_space_res":
451
+ output_channel = block_params.get("multiplier", 2) * output_channel
452
+ block = SpaceToDepthDownsample(
453
+ dims=dims,
454
+ in_channels=input_channel,
455
+ out_channels=output_channel,
456
+ stride=(1, 2, 2),
457
+ spatial_padding_mode=spatial_padding_mode,
458
+ )
459
+ elif block_name == "compress_time_res":
460
+ output_channel = block_params.get("multiplier", 2) * output_channel
461
+ block = SpaceToDepthDownsample(
462
+ dims=dims,
463
+ in_channels=input_channel,
464
+ out_channels=output_channel,
465
+ stride=(2, 1, 1),
466
+ spatial_padding_mode=spatial_padding_mode,
467
+ )
468
+ else:
469
+ raise ValueError(f"unknown block: {block_name}")
470
+
471
+ self.down_blocks.append(block)
472
+
473
+ # out
474
+ if norm_layer == "group_norm":
475
+ self.conv_norm_out = nn.GroupNorm(
476
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
477
+ )
478
+ elif norm_layer == "pixel_norm":
479
+ self.conv_norm_out = PixelNorm()
480
+ elif norm_layer == "layer_norm":
481
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
482
+
483
+ self.conv_act = nn.SiLU()
484
+
485
+ conv_out_channels = out_channels
486
+ if latent_log_var == "per_channel":
487
+ conv_out_channels *= 2
488
+ elif latent_log_var == "uniform":
489
+ conv_out_channels += 1
490
+ elif latent_log_var == "constant":
491
+ conv_out_channels += 1
492
+ elif latent_log_var != "none":
493
+ raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
494
+ self.conv_out = make_conv_nd(
495
+ dims,
496
+ output_channel,
497
+ conv_out_channels,
498
+ 3,
499
+ padding=1,
500
+ causal=True,
501
+ spatial_padding_mode=spatial_padding_mode,
502
+ )
503
+
504
+ self.gradient_checkpointing = False
505
+
506
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
507
+ r"""The forward method of the `Encoder` class."""
508
+
509
+ sample = patchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
510
+ sample = self.conv_in(sample)
511
+
512
+ checkpoint_fn = (
513
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
514
+ if self.gradient_checkpointing and self.training
515
+ else lambda x: x
516
+ )
517
+
518
+ for down_block in self.down_blocks:
519
+ sample = checkpoint_fn(down_block)(sample)
520
+
521
+ sample = self.conv_norm_out(sample)
522
+ sample = self.conv_act(sample)
523
+ sample = self.conv_out(sample)
524
+
525
+ if self.latent_log_var == "uniform":
526
+ last_channel = sample[:, -1:, ...]
527
+ num_dims = sample.dim()
528
+
529
+ if num_dims == 4:
530
+ # For shape (B, C, H, W)
531
+ repeated_last_channel = last_channel.repeat(
532
+ 1, sample.shape[1] - 2, 1, 1
533
+ )
534
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
535
+ elif num_dims == 5:
536
+ # For shape (B, C, F, H, W)
537
+ repeated_last_channel = last_channel.repeat(
538
+ 1, sample.shape[1] - 2, 1, 1, 1
539
+ )
540
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
541
+ else:
542
+ raise ValueError(f"Invalid input shape: {sample.shape}")
543
+ elif self.latent_log_var == "constant":
544
+ sample = sample[:, :-1, ...]
545
+ approx_ln_0 = (
546
+ -30
547
+ ) # this is the minimal clamp value in DiagonalGaussianDistribution objects
548
+ sample = torch.cat(
549
+ [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
550
+ dim=1,
551
+ )
552
+
553
+ return sample
554
+
555
+
556
+ class Decoder(nn.Module):
557
+ r"""
558
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
559
+
560
+ Args:
561
+ dims (`int` or `Tuple[int, int]`, *optional*, defaults to 3):
562
+ The number of dimensions to use in convolutions.
563
+ in_channels (`int`, *optional*, defaults to 3):
564
+ The number of input channels.
565
+ out_channels (`int`, *optional*, defaults to 3):
566
+ The number of output channels.
567
+ blocks (`List[Tuple[str, int]]`, *optional*, defaults to `[("res_x", 1)]`):
568
+ The blocks to use. Each block is a tuple of the block name and the number of layers.
569
+ base_channels (`int`, *optional*, defaults to 128):
570
+ The number of output channels for the first convolutional layer.
571
+ norm_num_groups (`int`, *optional*, defaults to 32):
572
+ The number of groups for normalization.
573
+ patch_size (`int`, *optional*, defaults to 1):
574
+ The patch size to use. Should be a power of 2.
575
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
576
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
577
+ causal (`bool`, *optional*, defaults to `True`):
578
+ Whether to use causal convolutions or not.
579
+ """
580
+
581
+ def __init__(
582
+ self,
583
+ dims,
584
+ in_channels: int = 3,
585
+ out_channels: int = 3,
586
+ blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
587
+ base_channels: int = 128,
588
+ layers_per_block: int = 2,
589
+ norm_num_groups: int = 32,
590
+ patch_size: int = 1,
591
+ norm_layer: str = "group_norm",
592
+ causal: bool = True,
593
+ timestep_conditioning: bool = False,
594
+ spatial_padding_mode: str = "zeros",
595
+ ):
596
+ super().__init__()
597
+ self.patch_size = patch_size
598
+ self.layers_per_block = layers_per_block
599
+ out_channels = out_channels * patch_size**2
600
+ self.causal = causal
601
+ self.blocks_desc = blocks
602
+
603
+ # Compute output channel to be product of all channel-multiplier blocks
604
+ output_channel = base_channels
605
+ for block_name, block_params in list(reversed(blocks)):
606
+ block_params = block_params if isinstance(block_params, dict) else {}
607
+ if block_name == "res_x_y":
608
+ output_channel = output_channel * block_params.get("multiplier", 2)
609
+ if block_name == "compress_all":
610
+ output_channel = output_channel * block_params.get("multiplier", 1)
611
+
612
+ self.conv_in = make_conv_nd(
613
+ dims,
614
+ in_channels,
615
+ output_channel,
616
+ kernel_size=3,
617
+ stride=1,
618
+ padding=1,
619
+ causal=True,
620
+ spatial_padding_mode=spatial_padding_mode,
621
+ )
622
+
623
+ self.up_blocks = nn.ModuleList([])
624
+
625
+ for block_name, block_params in list(reversed(blocks)):
626
+ input_channel = output_channel
627
+ if isinstance(block_params, int):
628
+ block_params = {"num_layers": block_params}
629
+
630
+ if block_name == "res_x":
631
+ block = UNetMidBlock3D(
632
+ dims=dims,
633
+ in_channels=input_channel,
634
+ num_layers=block_params["num_layers"],
635
+ resnet_eps=1e-6,
636
+ resnet_groups=norm_num_groups,
637
+ norm_layer=norm_layer,
638
+ inject_noise=block_params.get("inject_noise", False),
639
+ timestep_conditioning=timestep_conditioning,
640
+ spatial_padding_mode=spatial_padding_mode,
641
+ )
642
+ elif block_name == "attn_res_x":
643
+ block = UNetMidBlock3D(
644
+ dims=dims,
645
+ in_channels=input_channel,
646
+ num_layers=block_params["num_layers"],
647
+ resnet_groups=norm_num_groups,
648
+ norm_layer=norm_layer,
649
+ inject_noise=block_params.get("inject_noise", False),
650
+ timestep_conditioning=timestep_conditioning,
651
+ attention_head_dim=block_params["attention_head_dim"],
652
+ spatial_padding_mode=spatial_padding_mode,
653
+ )
654
+ elif block_name == "res_x_y":
655
+ output_channel = output_channel // block_params.get("multiplier", 2)
656
+ block = ResnetBlock3D(
657
+ dims=dims,
658
+ in_channels=input_channel,
659
+ out_channels=output_channel,
660
+ eps=1e-6,
661
+ groups=norm_num_groups,
662
+ norm_layer=norm_layer,
663
+ inject_noise=block_params.get("inject_noise", False),
664
+ timestep_conditioning=False,
665
+ spatial_padding_mode=spatial_padding_mode,
666
+ )
667
+ elif block_name == "compress_time":
668
+ block = DepthToSpaceUpsample(
669
+ dims=dims,
670
+ in_channels=input_channel,
671
+ stride=(2, 1, 1),
672
+ spatial_padding_mode=spatial_padding_mode,
673
+ )
674
+ elif block_name == "compress_space":
675
+ block = DepthToSpaceUpsample(
676
+ dims=dims,
677
+ in_channels=input_channel,
678
+ stride=(1, 2, 2),
679
+ spatial_padding_mode=spatial_padding_mode,
680
+ )
681
+ elif block_name == "compress_all":
682
+ output_channel = output_channel // block_params.get("multiplier", 1)
683
+ block = DepthToSpaceUpsample(
684
+ dims=dims,
685
+ in_channels=input_channel,
686
+ stride=(2, 2, 2),
687
+ residual=block_params.get("residual", False),
688
+ out_channels_reduction_factor=block_params.get("multiplier", 1),
689
+ spatial_padding_mode=spatial_padding_mode,
690
+ )
691
+ else:
692
+ raise ValueError(f"unknown layer: {block_name}")
693
+
694
+ self.up_blocks.append(block)
695
+
696
+ if norm_layer == "group_norm":
697
+ self.conv_norm_out = nn.GroupNorm(
698
+ num_channels=output_channel, num_groups=norm_num_groups, eps=1e-6
699
+ )
700
+ elif norm_layer == "pixel_norm":
701
+ self.conv_norm_out = PixelNorm()
702
+ elif norm_layer == "layer_norm":
703
+ self.conv_norm_out = LayerNorm(output_channel, eps=1e-6)
704
+
705
+ self.conv_act = nn.SiLU()
706
+ self.conv_out = make_conv_nd(
707
+ dims,
708
+ output_channel,
709
+ out_channels,
710
+ 3,
711
+ padding=1,
712
+ causal=True,
713
+ spatial_padding_mode=spatial_padding_mode,
714
+ )
715
+
716
+ self.gradient_checkpointing = False
717
+
718
+ self.timestep_conditioning = timestep_conditioning
719
+
720
+ if timestep_conditioning:
721
+ self.timestep_scale_multiplier = nn.Parameter(
722
+ torch.tensor(1000.0, dtype=torch.float32)
723
+ )
724
+ self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
725
+ output_channel * 2, 0
726
+ )
727
+ self.last_scale_shift_table = nn.Parameter(
728
+ torch.randn(2, output_channel) / output_channel**0.5
729
+ )
730
+
731
+ def forward(
732
+ self,
733
+ sample: torch.FloatTensor,
734
+ target_shape,
735
+ timestep: Optional[torch.Tensor] = None,
736
+ ) -> torch.FloatTensor:
737
+ r"""The forward method of the `Decoder` class."""
738
+ assert target_shape is not None, "target_shape must be provided"
739
+ batch_size = sample.shape[0]
740
+
741
+ sample = self.conv_in(sample, causal=self.causal)
742
+
743
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
744
+
745
+ checkpoint_fn = (
746
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
747
+ if self.gradient_checkpointing and self.training
748
+ else lambda x: x
749
+ )
750
+
751
+ sample = sample.to(upscale_dtype)
752
+
753
+ if self.timestep_conditioning:
754
+ assert (
755
+ timestep is not None
756
+ ), "should pass timestep with timestep_conditioning=True"
757
+ scaled_timestep = timestep * self.timestep_scale_multiplier
758
+
759
+ for up_block in self.up_blocks:
760
+ if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
761
+ sample = checkpoint_fn(up_block)(
762
+ sample, causal=self.causal, timestep=scaled_timestep
763
+ )
764
+ else:
765
+ sample = checkpoint_fn(up_block)(sample, causal=self.causal)
766
+
767
+ sample = self.conv_norm_out(sample)
768
+
769
+ if self.timestep_conditioning:
770
+ embedded_timestep = self.last_time_embedder(
771
+ timestep=scaled_timestep.flatten(),
772
+ resolution=None,
773
+ aspect_ratio=None,
774
+ batch_size=sample.shape[0],
775
+ hidden_dtype=sample.dtype,
776
+ )
777
+ embedded_timestep = embedded_timestep.view(
778
+ batch_size, embedded_timestep.shape[-1], 1, 1, 1
779
+ )
780
+ ada_values = self.last_scale_shift_table[
781
+ None, ..., None, None, None
782
+ ] + embedded_timestep.reshape(
783
+ batch_size,
784
+ 2,
785
+ -1,
786
+ embedded_timestep.shape[-3],
787
+ embedded_timestep.shape[-2],
788
+ embedded_timestep.shape[-1],
789
+ )
790
+ shift, scale = ada_values.unbind(dim=1)
791
+ sample = sample * (1 + scale) + shift
792
+
793
+ sample = self.conv_act(sample)
794
+ sample = self.conv_out(sample, causal=self.causal)
795
+
796
+ sample = unpatchify(sample, patch_size_hw=self.patch_size, patch_size_t=1)
797
+
798
+ return sample
799
+
800
+
801
+ class UNetMidBlock3D(nn.Module):
802
+ """
803
+ A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
804
+
805
+ Args:
806
+ in_channels (`int`): The number of input channels.
807
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
808
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
809
+ resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
810
+ resnet_groups (`int`, *optional*, defaults to 32):
811
+ The number of groups to use in the group normalization layers of the resnet blocks.
812
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
813
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
814
+ inject_noise (`bool`, *optional*, defaults to `False`):
815
+ Whether to inject noise into the hidden states.
816
+ timestep_conditioning (`bool`, *optional*, defaults to `False`):
817
+ Whether to condition the hidden states on the timestep.
818
+ attention_head_dim (`int`, *optional*, defaults to -1):
819
+ The dimension of the attention head. If -1, no attention is used.
820
+
821
+ Returns:
822
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
823
+ in_channels, frames, height, width)`.
824
+
825
+ """
826
+
827
+ def __init__(
828
+ self,
829
+ dims: Union[int, Tuple[int, int]],
830
+ in_channels: int,
831
+ dropout: float = 0.0,
832
+ num_layers: int = 1,
833
+ resnet_eps: float = 1e-6,
834
+ resnet_groups: int = 32,
835
+ norm_layer: str = "group_norm",
836
+ inject_noise: bool = False,
837
+ timestep_conditioning: bool = False,
838
+ attention_head_dim: int = -1,
839
+ spatial_padding_mode: str = "zeros",
840
+ ):
841
+ super().__init__()
842
+ resnet_groups = (
843
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
844
+ )
845
+ self.timestep_conditioning = timestep_conditioning
846
+
847
+ if timestep_conditioning:
848
+ self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
849
+ in_channels * 4, 0
850
+ )
851
+
852
+ self.res_blocks = nn.ModuleList(
853
+ [
854
+ ResnetBlock3D(
855
+ dims=dims,
856
+ in_channels=in_channels,
857
+ out_channels=in_channels,
858
+ eps=resnet_eps,
859
+ groups=resnet_groups,
860
+ dropout=dropout,
861
+ norm_layer=norm_layer,
862
+ inject_noise=inject_noise,
863
+ timestep_conditioning=timestep_conditioning,
864
+ spatial_padding_mode=spatial_padding_mode,
865
+ )
866
+ for _ in range(num_layers)
867
+ ]
868
+ )
869
+
870
+ self.attention_blocks = None
871
+
872
+ if attention_head_dim > 0:
873
+ if attention_head_dim > in_channels:
874
+ raise ValueError(
875
+ "attention_head_dim must be less than or equal to in_channels"
876
+ )
877
+
878
+ self.attention_blocks = nn.ModuleList(
879
+ [
880
+ Attention(
881
+ query_dim=in_channels,
882
+ heads=in_channels // attention_head_dim,
883
+ dim_head=attention_head_dim,
884
+ bias=True,
885
+ out_bias=True,
886
+ qk_norm="rms_norm",
887
+ residual_connection=True,
888
+ )
889
+ for _ in range(num_layers)
890
+ ]
891
+ )
892
+
893
+ def forward(
894
+ self,
895
+ hidden_states: torch.FloatTensor,
896
+ causal: bool = True,
897
+ timestep: Optional[torch.Tensor] = None,
898
+ ) -> torch.FloatTensor:
899
+ timestep_embed = None
900
+ if self.timestep_conditioning:
901
+ assert (
902
+ timestep is not None
903
+ ), "should pass timestep with timestep_conditioning=True"
904
+ batch_size = hidden_states.shape[0]
905
+ timestep_embed = self.time_embedder(
906
+ timestep=timestep.flatten(),
907
+ resolution=None,
908
+ aspect_ratio=None,
909
+ batch_size=batch_size,
910
+ hidden_dtype=hidden_states.dtype,
911
+ )
912
+ timestep_embed = timestep_embed.view(
913
+ batch_size, timestep_embed.shape[-1], 1, 1, 1
914
+ )
915
+
916
+ if self.attention_blocks:
917
+ for resnet, attention in zip(self.res_blocks, self.attention_blocks):
918
+ hidden_states = resnet(
919
+ hidden_states, causal=causal, timestep=timestep_embed
920
+ )
921
+
922
+ # Reshape the hidden states to be (batch_size, frames * height * width, channel)
923
+ batch_size, channel, frames, height, width = hidden_states.shape
924
+ hidden_states = hidden_states.view(
925
+ batch_size, channel, frames * height * width
926
+ ).transpose(1, 2)
927
+
928
+ if attention.use_tpu_flash_attention:
929
+ # Pad the second dimension to be divisible by block_k_major (block in flash attention)
930
+ seq_len = hidden_states.shape[1]
931
+ block_k_major = 512
932
+ pad_len = (block_k_major - seq_len % block_k_major) % block_k_major
933
+ if pad_len > 0:
934
+ hidden_states = F.pad(
935
+ hidden_states, (0, 0, 0, pad_len), "constant", 0
936
+ )
937
+
938
+ # Create a mask with ones for the original sequence length and zeros for the padded indexes
939
+ mask = torch.ones(
940
+ (hidden_states.shape[0], seq_len),
941
+ device=hidden_states.device,
942
+ dtype=hidden_states.dtype,
943
+ )
944
+ if pad_len > 0:
945
+ mask = F.pad(mask, (0, pad_len), "constant", 0)
946
+
947
+ hidden_states = attention(
948
+ hidden_states,
949
+ attention_mask=(
950
+ None if not attention.use_tpu_flash_attention else mask
951
+ ),
952
+ )
953
+
954
+ if attention.use_tpu_flash_attention:
955
+ # Remove the padding
956
+ if pad_len > 0:
957
+ hidden_states = hidden_states[:, :-pad_len, :]
958
+
959
+ # Reshape the hidden states back to (batch_size, channel, frames, height, width)
960
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
961
+ batch_size, channel, frames, height, width
962
+ )
963
+ else:
964
+ for resnet in self.res_blocks:
965
+ hidden_states = resnet(
966
+ hidden_states, causal=causal, timestep=timestep_embed
967
+ )
968
+
969
+ return hidden_states
970
+
971
+
972
+ class SpaceToDepthDownsample(nn.Module):
973
+ def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
974
+ super().__init__()
975
+ self.stride = stride
976
+ self.group_size = in_channels * np.prod(stride) // out_channels
977
+ self.conv = make_conv_nd(
978
+ dims=dims,
979
+ in_channels=in_channels,
980
+ out_channels=out_channels // np.prod(stride),
981
+ kernel_size=3,
982
+ stride=1,
983
+ causal=True,
984
+ spatial_padding_mode=spatial_padding_mode,
985
+ )
986
+
987
+ def forward(self, x, causal: bool = True):
988
+ if self.stride[0] == 2:
989
+ x = torch.cat(
990
+ [x[:, :, :1, :, :], x], dim=2
991
+ ) # duplicate first frames for padding
992
+
993
+ # skip connection
994
+ x_in = rearrange(
995
+ x,
996
+ "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
997
+ p1=self.stride[0],
998
+ p2=self.stride[1],
999
+ p3=self.stride[2],
1000
+ )
1001
+ x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
1002
+ x_in = x_in.mean(dim=2)
1003
+
1004
+ # conv
1005
+ x = self.conv(x, causal=causal)
1006
+ x = rearrange(
1007
+ x,
1008
+ "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
1009
+ p1=self.stride[0],
1010
+ p2=self.stride[1],
1011
+ p3=self.stride[2],
1012
+ )
1013
+
1014
+ x = x + x_in
1015
+
1016
+ return x
1017
+
1018
+
1019
+ class DepthToSpaceUpsample(nn.Module):
1020
+ def __init__(
1021
+ self,
1022
+ dims,
1023
+ in_channels,
1024
+ stride,
1025
+ residual=False,
1026
+ out_channels_reduction_factor=1,
1027
+ spatial_padding_mode="zeros",
1028
+ ):
1029
+ super().__init__()
1030
+ self.stride = stride
1031
+ self.out_channels = (
1032
+ np.prod(stride) * in_channels // out_channels_reduction_factor
1033
+ )
1034
+ self.conv = make_conv_nd(
1035
+ dims=dims,
1036
+ in_channels=in_channels,
1037
+ out_channels=self.out_channels,
1038
+ kernel_size=3,
1039
+ stride=1,
1040
+ causal=True,
1041
+ spatial_padding_mode=spatial_padding_mode,
1042
+ )
1043
+ self.residual = residual
1044
+ self.out_channels_reduction_factor = out_channels_reduction_factor
1045
+
1046
+ def forward(self, x, causal: bool = True):
1047
+ if self.residual:
1048
+ # Reshape and duplicate the input to match the output shape
1049
+ x_in = rearrange(
1050
+ x,
1051
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
1052
+ p1=self.stride[0],
1053
+ p2=self.stride[1],
1054
+ p3=self.stride[2],
1055
+ )
1056
+ num_repeat = np.prod(self.stride) // self.out_channels_reduction_factor
1057
+ x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
1058
+ if self.stride[0] == 2:
1059
+ x_in = x_in[:, :, 1:, :, :]
1060
+ x = self.conv(x, causal=causal)
1061
+ x = rearrange(
1062
+ x,
1063
+ "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
1064
+ p1=self.stride[0],
1065
+ p2=self.stride[1],
1066
+ p3=self.stride[2],
1067
+ )
1068
+ if self.stride[0] == 2:
1069
+ x = x[:, :, 1:, :, :]
1070
+ if self.residual:
1071
+ x = x + x_in
1072
+ return x
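An illustrative shape trace (not from the repository; all tensor sizes invented) of the depth-to-space rearrange used by DepthToSpaceUpsample above: the preceding conv expands channels by prod(stride), and the rearrange folds those channels back into depth, height, and width.

import torch
from einops import rearrange

# Pretend the 3x3x3 conv has already expanded channels by prod(stride) = 8.
conv_out = torch.randn(1, 64 * 8, 4, 8, 8)          # (b, c * 8, d, h, w)
y = rearrange(
    conv_out,
    "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
    p1=2, p2=2, p3=2,
)
print(y.shape)   # torch.Size([1, 64, 8, 16, 16]) -- every spatio-temporal dimension doubled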
1073
+
1074
+
1075
+ class LayerNorm(nn.Module):
1076
+ def __init__(self, dim, eps, elementwise_affine=True) -> None:
1077
+ super().__init__()
1078
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
1079
+
1080
+ def forward(self, x):
1081
+ x = rearrange(x, "b c d h w -> b d h w c")
1082
+ x = self.norm(x)
1083
+ x = rearrange(x, "b d h w c -> b c d h w")
1084
+ return x
1085
+
1086
+
1087
+ class ResnetBlock3D(nn.Module):
1088
+ r"""
1089
+ A Resnet block.
1090
+
1091
+ Parameters:
1092
+ in_channels (`int`): The number of channels in the input.
1093
+ out_channels (`int`, *optional*, default to be `None`):
1094
+ The number of output channels for the first conv layer. If None, same as `in_channels`.
1095
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
1096
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
1097
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
1098
+ """
1099
+
1100
+ def __init__(
1101
+ self,
1102
+ dims: Union[int, Tuple[int, int]],
1103
+ in_channels: int,
1104
+ out_channels: Optional[int] = None,
1105
+ dropout: float = 0.0,
1106
+ groups: int = 32,
1107
+ eps: float = 1e-6,
1108
+ norm_layer: str = "group_norm",
1109
+ inject_noise: bool = False,
1110
+ timestep_conditioning: bool = False,
1111
+ spatial_padding_mode: str = "zeros",
1112
+ ):
1113
+ super().__init__()
1114
+ self.in_channels = in_channels
1115
+ out_channels = in_channels if out_channels is None else out_channels
1116
+ self.out_channels = out_channels
1117
+ self.inject_noise = inject_noise
1118
+
1119
+ if norm_layer == "group_norm":
1120
+ self.norm1 = nn.GroupNorm(
1121
+ num_groups=groups, num_channels=in_channels, eps=eps, affine=True
1122
+ )
1123
+ elif norm_layer == "pixel_norm":
1124
+ self.norm1 = PixelNorm()
1125
+ elif norm_layer == "layer_norm":
1126
+ self.norm1 = LayerNorm(in_channels, eps=eps, elementwise_affine=True)
1127
+
1128
+ self.non_linearity = nn.SiLU()
1129
+
1130
+ self.conv1 = make_conv_nd(
1131
+ dims,
1132
+ in_channels,
1133
+ out_channels,
1134
+ kernel_size=3,
1135
+ stride=1,
1136
+ padding=1,
1137
+ causal=True,
1138
+ spatial_padding_mode=spatial_padding_mode,
1139
+ )
1140
+
1141
+ if inject_noise:
1142
+ self.per_channel_scale1 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
1143
+
1144
+ if norm_layer == "group_norm":
1145
+ self.norm2 = nn.GroupNorm(
1146
+ num_groups=groups, num_channels=out_channels, eps=eps, affine=True
1147
+ )
1148
+ elif norm_layer == "pixel_norm":
1149
+ self.norm2 = PixelNorm()
1150
+ elif norm_layer == "layer_norm":
1151
+ self.norm2 = LayerNorm(out_channels, eps=eps, elementwise_affine=True)
1152
+
1153
+ self.dropout = torch.nn.Dropout(dropout)
1154
+
1155
+ self.conv2 = make_conv_nd(
1156
+ dims,
1157
+ out_channels,
1158
+ out_channels,
1159
+ kernel_size=3,
1160
+ stride=1,
1161
+ padding=1,
1162
+ causal=True,
1163
+ spatial_padding_mode=spatial_padding_mode,
1164
+ )
1165
+
1166
+ if inject_noise:
1167
+ self.per_channel_scale2 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
1168
+
1169
+ self.conv_shortcut = (
1170
+ make_linear_nd(
1171
+ dims=dims, in_channels=in_channels, out_channels=out_channels
1172
+ )
1173
+ if in_channels != out_channels
1174
+ else nn.Identity()
1175
+ )
1176
+
1177
+ self.norm3 = (
1178
+ LayerNorm(in_channels, eps=eps, elementwise_affine=True)
1179
+ if in_channels != out_channels
1180
+ else nn.Identity()
1181
+ )
1182
+
1183
+ self.timestep_conditioning = timestep_conditioning
1184
+
1185
+ if timestep_conditioning:
1186
+ self.scale_shift_table = nn.Parameter(
1187
+ torch.randn(4, in_channels) / in_channels**0.5
1188
+ )
1189
+
1190
+ def _feed_spatial_noise(
1191
+ self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
1192
+ ) -> torch.FloatTensor:
1193
+ spatial_shape = hidden_states.shape[-2:]
1194
+ device = hidden_states.device
1195
+ dtype = hidden_states.dtype
1196
+
1197
+ # similar to the "explicit noise inputs" method in style-gan
1198
+ spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
1199
+ scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
1200
+ hidden_states = hidden_states + scaled_noise
1201
+
1202
+ return hidden_states
1203
+
1204
+ def forward(
1205
+ self,
1206
+ input_tensor: torch.FloatTensor,
1207
+ causal: bool = True,
1208
+ timestep: Optional[torch.Tensor] = None,
1209
+ ) -> torch.FloatTensor:
1210
+ hidden_states = input_tensor
1211
+ batch_size = hidden_states.shape[0]
1212
+
1213
+ hidden_states = self.norm1(hidden_states)
1214
+ if self.timestep_conditioning:
1215
+ assert (
1216
+ timestep is not None
1217
+ ), "should pass timestep with timestep_conditioning=True"
1218
+ ada_values = self.scale_shift_table[
1219
+ None, ..., None, None, None
1220
+ ] + timestep.reshape(
1221
+ batch_size,
1222
+ 4,
1223
+ -1,
1224
+ timestep.shape[-3],
1225
+ timestep.shape[-2],
1226
+ timestep.shape[-1],
1227
+ )
1228
+ shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)
1229
+
1230
+ hidden_states = hidden_states * (1 + scale1) + shift1
1231
+
1232
+ hidden_states = self.non_linearity(hidden_states)
1233
+
1234
+ hidden_states = self.conv1(hidden_states, causal=causal)
1235
+
1236
+ if self.inject_noise:
1237
+ hidden_states = self._feed_spatial_noise(
1238
+ hidden_states, self.per_channel_scale1
1239
+ )
1240
+
1241
+ hidden_states = self.norm2(hidden_states)
1242
+
1243
+ if self.timestep_conditioning:
1244
+ hidden_states = hidden_states * (1 + scale2) + shift2
1245
+
1246
+ hidden_states = self.non_linearity(hidden_states)
1247
+
1248
+ hidden_states = self.dropout(hidden_states)
1249
+
1250
+ hidden_states = self.conv2(hidden_states, causal=causal)
1251
+
1252
+ if self.inject_noise:
1253
+ hidden_states = self._feed_spatial_noise(
1254
+ hidden_states, self.per_channel_scale2
1255
+ )
1256
+
1257
+ input_tensor = self.norm3(input_tensor)
1258
+
1259
+ batch_size = input_tensor.shape[0]
1260
+
1261
+ input_tensor = self.conv_shortcut(input_tensor)
1262
+
1263
+ output_tensor = input_tensor + hidden_states
1264
+
1265
+ return output_tensor
1266
+
1267
+
1268
+ def patchify(x, patch_size_hw, patch_size_t=1):
1269
+ if patch_size_hw == 1 and patch_size_t == 1:
1270
+ return x
1271
+ if x.dim() == 4:
1272
+ x = rearrange(
1273
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
1274
+ )
1275
+ elif x.dim() == 5:
1276
+ x = rearrange(
1277
+ x,
1278
+ "b c (f p) (h q) (w r) -> b (c p r q) f h w",
1279
+ p=patch_size_t,
1280
+ q=patch_size_hw,
1281
+ r=patch_size_hw,
1282
+ )
1283
+ else:
1284
+ raise ValueError(f"Invalid input shape: {x.shape}")
1285
+
1286
+ return x
1287
+
1288
+
1289
+ def unpatchify(x, patch_size_hw, patch_size_t=1):
1290
+ if patch_size_hw == 1 and patch_size_t == 1:
1291
+ return x
1292
+
1293
+ if x.dim() == 4:
1294
+ x = rearrange(
1295
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
1296
+ )
1297
+ elif x.dim() == 5:
1298
+ x = rearrange(
1299
+ x,
1300
+ "b (c p r q) f h w -> b c (f p) (h q) (w r)",
1301
+ p=patch_size_t,
1302
+ q=patch_size_hw,
1303
+ r=patch_size_hw,
1304
+ )
1305
+
1306
+ return x
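A minimal sketch (sizes invented) of what patchify/unpatchify above do to a video tensor: 4x4 spatial patches are folded into the channel dimension, and unpatchify inverts the permutation exactly.

import torch
from flash_head.ltx_video.models.autoencoders.causal_video_autoencoder import patchify, unpatchify

x = torch.randn(2, 3, 8, 64, 64)                     # (batch, channels, frames, height, width)
p = patchify(x, patch_size_hw=4, patch_size_t=1)
print(p.shape)                                        # torch.Size([2, 48, 8, 16, 16])
u = unpatchify(p, patch_size_hw=4, patch_size_t=1)
print(torch.equal(x, u))                              # True -- the mapping is a pure permutation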
1307
+
1308
+
1309
+ def create_video_autoencoder_demo_config(
1310
+ latent_channels: int = 64,
1311
+ ):
1312
+ encoder_blocks = [
1313
+ ("res_x", {"num_layers": 2}),
1314
+ ("compress_space_res", {"multiplier": 2}),
1315
+ ("res_x", {"num_layers": 2}),
1316
+ ("compress_time_res", {"multiplier": 2}),
1317
+ ("res_x", {"num_layers": 1}),
1318
+ ("compress_all_res", {"multiplier": 2}),
1319
+ ("res_x", {"num_layers": 1}),
1320
+ ("compress_all_res", {"multiplier": 2}),
1321
+ ("res_x", {"num_layers": 1}),
1322
+ ]
1323
+ decoder_blocks = [
1324
+ ("res_x", {"num_layers": 2, "inject_noise": False}),
1325
+ ("compress_all", {"residual": True, "multiplier": 2}),
1326
+ ("res_x", {"num_layers": 2, "inject_noise": False}),
1327
+ ("compress_all", {"residual": True, "multiplier": 2}),
1328
+ ("res_x", {"num_layers": 2, "inject_noise": False}),
1329
+ ("compress_all", {"residual": True, "multiplier": 2}),
1330
+ ("res_x", {"num_layers": 2, "inject_noise": False}),
1331
+ ]
1332
+ return {
1333
+ "_class_name": "CausalVideoAutoencoder",
1334
+ "dims": 3,
1335
+ "encoder_blocks": encoder_blocks,
1336
+ "decoder_blocks": decoder_blocks,
1337
+ "latent_channels": latent_channels,
1338
+ "norm_layer": "pixel_norm",
1339
+ "patch_size": 4,
1340
+ "latent_log_var": "uniform",
1341
+ "use_quant_conv": False,
1342
+ "causal_decoder": False,
1343
+ "timestep_conditioning": True,
1344
+ "spatial_padding_mode": "replicate",
1345
+ }
1346
+
1347
+
1348
+ def test_vae_patchify_unpatchify():
1349
+ import torch
1350
+
1351
+ x = torch.randn(2, 3, 8, 64, 64)
1352
+ x_patched = patchify(x, patch_size_hw=4, patch_size_t=4)
1353
+ x_unpatched = unpatchify(x_patched, patch_size_hw=4, patch_size_t=4)
1354
+ assert torch.allclose(x, x_unpatched)
1355
+
1356
+
1357
+ def demo_video_autoencoder_forward_backward():
1358
+ # Configuration for the VideoAutoencoder
1359
+ config = create_video_autoencoder_demo_config()
1360
+
1361
+ # Instantiate the VideoAutoencoder with the specified configuration
1362
+ video_autoencoder = CausalVideoAutoencoder.from_config(config)
1363
+
1364
+ print(video_autoencoder)
1365
+ video_autoencoder.eval()
1366
+ # Print the total number of parameters in the video autoencoder
1367
+ total_params = sum(p.numel() for p in video_autoencoder.parameters())
1368
+ print(f"Total number of parameters in VideoAutoencoder: {total_params:,}")
1369
+
1370
+ # Create a mock input tensor simulating a batch of videos
1371
+ # Shape: (batch_size, channels, depth, height, width)
1372
+ # E.g., 4 videos, each with 3 color channels, 16 frames, and 64x64 pixels per frame
1373
+ input_videos = torch.randn(2, 3, 17, 64, 64)
1374
+
1375
+ # Forward pass: encode and decode the input videos
1376
+ latent = video_autoencoder.encode(input_videos).latent_dist.mode()
1377
+ print(f"input shape={input_videos.shape}")
1378
+ print(f"latent shape={latent.shape}")
1379
+
1380
+ timestep = torch.ones(input_videos.shape[0]) * 0.1
1381
+ reconstructed_videos = video_autoencoder.decode(
1382
+ latent, target_shape=input_videos.shape, timestep=timestep
1383
+ ).sample
1384
+
1385
+ print(f"reconstructed shape={reconstructed_videos.shape}")
1386
+
1387
+ # Validate that single image gets treated the same way as first frame
1388
+ input_image = input_videos[:, :, :1, :, :]
1389
+ image_latent = video_autoencoder.encode(input_image).latent_dist.mode()
1390
+ _ = video_autoencoder.decode(
1391
+ image_latent, target_shape=image_latent.shape, timestep=timestep
1392
+ ).sample
1393
+
1394
+ first_frame_latent = latent[:, :, :1, :, :]
1395
+
1396
+ assert torch.allclose(image_latent, first_frame_latent, atol=1e-6)
1397
+ # assert torch.allclose(reconstructed_image, reconstructed_videos[:, :, :1, :, :], atol=1e-6)
1398
+ # assert torch.allclose(image_latent, first_frame_latent, atol=1e-6)
1399
+ # assert (reconstructed_image == reconstructed_videos[:, :, :1, :, :]).all()
1400
+
1401
+ # Calculate the loss (e.g., mean squared error)
1402
+ loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)
1403
+
1404
+ # Perform backward pass
1405
+ loss.backward()
1406
+
1407
+ print(f"Demo completed with loss: {loss.item()}")
1408
+
1409
+
1410
+ # Ensure to call the demo function to execute the forward and backward pass
1411
+ if __name__ == "__main__":
1412
+ demo_video_autoencoder_forward_backward()
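A minimal standalone sketch (all sizes invented, no repository code required) of the AdaLN-style modulation that ResnetBlock3D and the decoder above apply when timestep_conditioning is enabled: a learned table plus a per-sample timestep embedding is split into (shift, scale) pairs and applied as x * (1 + scale) + shift.

import torch

batch, channels = 2, 8
scale_shift_table = torch.randn(2, channels) / channels**0.5        # learned parameter
embedded_timestep = torch.randn(batch, 2 * channels, 1, 1, 1)        # from the timestep embedder

ada = scale_shift_table[None, ..., None, None, None] + embedded_timestep.reshape(
    batch, 2, channels, 1, 1, 1
)
shift, scale = ada.unbind(dim=1)                                      # each (batch, channels, 1, 1, 1)

x = torch.randn(batch, channels, 3, 4, 4)                             # (b, c, f, h, w)
x = x * (1 + scale) + shift                                           # broadcast over f, h, w
print(x.shape)                                                        # torch.Size([2, 8, 3, 4, 4])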
flash_head/ltx_video/models/autoencoders/conv_nd_factory.py ADDED
@@ -0,0 +1,90 @@
1
+ from typing import Tuple, Union
2
+
3
+ import torch
4
+
5
+ from flash_head.ltx_video.models.autoencoders.dual_conv3d import DualConv3d
6
+ from flash_head.ltx_video.models.autoencoders.causal_conv3d import CausalConv3d
7
+
8
+
9
+ def make_conv_nd(
10
+ dims: Union[int, Tuple[int, int]],
11
+ in_channels: int,
12
+ out_channels: int,
13
+ kernel_size: int,
14
+ stride=1,
15
+ padding=0,
16
+ dilation=1,
17
+ groups=1,
18
+ bias=True,
19
+ causal=False,
20
+ spatial_padding_mode="zeros",
21
+ temporal_padding_mode="zeros",
22
+ ):
23
+ if not (spatial_padding_mode == temporal_padding_mode or causal):
24
+ raise NotImplementedError("spatial and temporal padding modes must be equal")
25
+ if dims == 2:
26
+ return torch.nn.Conv2d(
27
+ in_channels=in_channels,
28
+ out_channels=out_channels,
29
+ kernel_size=kernel_size,
30
+ stride=stride,
31
+ padding=padding,
32
+ dilation=dilation,
33
+ groups=groups,
34
+ bias=bias,
35
+ padding_mode=spatial_padding_mode,
36
+ )
37
+ elif dims == 3:
38
+ if causal:
39
+ return CausalConv3d(
40
+ in_channels=in_channels,
41
+ out_channels=out_channels,
42
+ kernel_size=kernel_size,
43
+ stride=stride,
44
+ padding=padding,
45
+ dilation=dilation,
46
+ groups=groups,
47
+ bias=bias,
48
+ spatial_padding_mode=spatial_padding_mode,
49
+ )
50
+ return torch.nn.Conv3d(
51
+ in_channels=in_channels,
52
+ out_channels=out_channels,
53
+ kernel_size=kernel_size,
54
+ stride=stride,
55
+ padding=padding,
56
+ dilation=dilation,
57
+ groups=groups,
58
+ bias=bias,
59
+ padding_mode=spatial_padding_mode,
60
+ )
61
+ elif dims == (2, 1):
62
+ return DualConv3d(
63
+ in_channels=in_channels,
64
+ out_channels=out_channels,
65
+ kernel_size=kernel_size,
66
+ stride=stride,
67
+ padding=padding,
68
+ bias=bias,
69
+ padding_mode=spatial_padding_mode,
70
+ )
71
+ else:
72
+ raise ValueError(f"unsupported dimensions: {dims}")
73
+
74
+
75
+ def make_linear_nd(
76
+ dims: int,
77
+ in_channels: int,
78
+ out_channels: int,
79
+ bias=True,
80
+ ):
81
+ if dims == 2:
82
+ return torch.nn.Conv2d(
83
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
84
+ )
85
+ elif dims == 3 or dims == (2, 1):
86
+ return torch.nn.Conv3d(
87
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias
88
+ )
89
+ else:
90
+ raise ValueError(f"unsupported dimensions: {dims}")
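A hedged usage sketch of the make_conv_nd factory above (channel counts and sizes are invented): dims selects the convolution type, with dims=2 giving nn.Conv2d, dims=3 giving nn.Conv3d (or CausalConv3d when causal=True), and dims=(2, 1) giving DualConv3d.

import torch
from flash_head.ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd

conv = make_conv_nd(
    dims=3,
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    padding=1,
    spatial_padding_mode="replicate",
)
x = torch.randn(1, 16, 9, 32, 32)   # (batch, channels, frames, height, width)
print(conv(x).shape)                # torch.Size([1, 32, 9, 32, 32]) -- padding=1 preserves the size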
flash_head/ltx_video/models/autoencoders/dual_conv3d.py ADDED
@@ -0,0 +1,217 @@
1
+ import math
2
+ from typing import Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+
9
+
10
+ class DualConv3d(nn.Module):
11
+ def __init__(
12
+ self,
13
+ in_channels,
14
+ out_channels,
15
+ kernel_size,
16
+ stride: Union[int, Tuple[int, int, int]] = 1,
17
+ padding: Union[int, Tuple[int, int, int]] = 0,
18
+ dilation: Union[int, Tuple[int, int, int]] = 1,
19
+ groups=1,
20
+ bias=True,
21
+ padding_mode="zeros",
22
+ ):
23
+ super(DualConv3d, self).__init__()
24
+
25
+ self.in_channels = in_channels
26
+ self.out_channels = out_channels
27
+ self.padding_mode = padding_mode
28
+ # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
29
+ if isinstance(kernel_size, int):
30
+ kernel_size = (kernel_size, kernel_size, kernel_size)
31
+ if kernel_size == (1, 1, 1):
32
+ raise ValueError(
33
+ "kernel_size must be greater than 1. Use make_linear_nd instead."
34
+ )
35
+ if isinstance(stride, int):
36
+ stride = (stride, stride, stride)
37
+ if isinstance(padding, int):
38
+ padding = (padding, padding, padding)
39
+ if isinstance(dilation, int):
40
+ dilation = (dilation, dilation, dilation)
41
+
42
+ # Set parameters for convolutions
43
+ self.groups = groups
44
+ self.bias = bias
45
+
46
+ # Define the size of the channels after the first convolution
47
+ intermediate_channels = (
48
+ out_channels if in_channels < out_channels else in_channels
49
+ )
50
+
51
+ # Define parameters for the first convolution
52
+ self.weight1 = nn.Parameter(
53
+ torch.Tensor(
54
+ intermediate_channels,
55
+ in_channels // groups,
56
+ 1,
57
+ kernel_size[1],
58
+ kernel_size[2],
59
+ )
60
+ )
61
+ self.stride1 = (1, stride[1], stride[2])
62
+ self.padding1 = (0, padding[1], padding[2])
63
+ self.dilation1 = (1, dilation[1], dilation[2])
64
+ if bias:
65
+ self.bias1 = nn.Parameter(torch.Tensor(intermediate_channels))
66
+ else:
67
+ self.register_parameter("bias1", None)
68
+
69
+ # Define parameters for the second convolution
70
+ self.weight2 = nn.Parameter(
71
+ torch.Tensor(
72
+ out_channels, intermediate_channels // groups, kernel_size[0], 1, 1
73
+ )
74
+ )
75
+ self.stride2 = (stride[0], 1, 1)
76
+ self.padding2 = (padding[0], 0, 0)
77
+ self.dilation2 = (dilation[0], 1, 1)
78
+ if bias:
79
+ self.bias2 = nn.Parameter(torch.Tensor(out_channels))
80
+ else:
81
+ self.register_parameter("bias2", None)
82
+
83
+ # Initialize weights and biases
84
+ self.reset_parameters()
85
+
86
+ def reset_parameters(self):
87
+ nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
88
+ nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
89
+ if self.bias:
90
+ fan_in1, _ = nn.init._calculate_fan_in_and_fan_out(self.weight1)
91
+ bound1 = 1 / math.sqrt(fan_in1)
92
+ nn.init.uniform_(self.bias1, -bound1, bound1)
93
+ fan_in2, _ = nn.init._calculate_fan_in_and_fan_out(self.weight2)
94
+ bound2 = 1 / math.sqrt(fan_in2)
95
+ nn.init.uniform_(self.bias2, -bound2, bound2)
96
+
97
+ def forward(self, x, use_conv3d=False, skip_time_conv=False):
98
+ if use_conv3d:
99
+ return self.forward_with_3d(x=x, skip_time_conv=skip_time_conv)
100
+ else:
101
+ return self.forward_with_2d(x=x, skip_time_conv=skip_time_conv)
102
+
103
+ def forward_with_3d(self, x, skip_time_conv):
104
+ # First convolution
105
+ x = F.conv3d(
106
+ x,
107
+ self.weight1,
108
+ self.bias1,
109
+ self.stride1,
110
+ self.padding1,
111
+ self.dilation1,
112
+ self.groups,
113
+ padding_mode=self.padding_mode,
114
+ )
115
+
116
+ if skip_time_conv:
117
+ return x
118
+
119
+ # Second convolution
120
+ x = F.conv3d(
121
+ x,
122
+ self.weight2,
123
+ self.bias2,
124
+ self.stride2,
125
+ self.padding2,
126
+ self.dilation2,
127
+ self.groups,
128
+ padding_mode=self.padding_mode,
129
+ )
130
+
131
+ return x
132
+
133
+ def forward_with_2d(self, x, skip_time_conv):
134
+ b, c, d, h, w = x.shape
135
+
136
+ # First 2D convolution
137
+ x = rearrange(x, "b c d h w -> (b d) c h w")
138
+ # Squeeze the depth dimension out of weight1 since it's 1
139
+ weight1 = self.weight1.squeeze(2)
140
+ # Select stride, padding, and dilation for the 2D convolution
141
+ stride1 = (self.stride1[1], self.stride1[2])
142
+ padding1 = (self.padding1[1], self.padding1[2])
143
+ dilation1 = (self.dilation1[1], self.dilation1[2])
144
+ x = F.conv2d(
145
+ x,
146
+ weight1,
147
+ self.bias1,
148
+ stride1,
149
+ padding1,
150
+ dilation1,
151
+ self.groups,
152
+ padding_mode=self.padding_mode,
153
+ )
154
+
155
+ _, _, h, w = x.shape
156
+
157
+ if skip_time_conv:
158
+ x = rearrange(x, "(b d) c h w -> b c d h w", b=b)
159
+ return x
160
+
161
+ # Second convolution which is essentially treated as a 1D convolution across the 'd' dimension
162
+ x = rearrange(x, "(b d) c h w -> (b h w) c d", b=b)
163
+
164
+ # Reshape weight2 to match the expected dimensions for conv1d
165
+ weight2 = self.weight2.squeeze(-1).squeeze(-1)
166
+ # Use only the relevant dimension for stride, padding, and dilation for the 1D convolution
167
+ stride2 = self.stride2[0]
168
+ padding2 = self.padding2[0]
169
+ dilation2 = self.dilation2[0]
170
+ x = F.conv1d(
171
+ x,
172
+ weight2,
173
+ self.bias2,
174
+ stride2,
175
+ padding2,
176
+ dilation2,
177
+ self.groups,
178
+ padding_mode=self.padding_mode,
179
+ )
180
+ x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)
181
+
182
+ return x
183
+
184
+ @property
185
+ def weight(self):
186
+ return self.weight2
187
+
188
+
189
+ def test_dual_conv3d_consistency():
190
+ # Initialize parameters
191
+ in_channels = 3
192
+ out_channels = 5
193
+ kernel_size = (3, 3, 3)
194
+ stride = (2, 2, 2)
195
+ padding = (1, 1, 1)
196
+
197
+ # Create an instance of the DualConv3d class
198
+ dual_conv3d = DualConv3d(
199
+ in_channels=in_channels,
200
+ out_channels=out_channels,
201
+ kernel_size=kernel_size,
202
+ stride=stride,
203
+ padding=padding,
204
+ bias=True,
205
+ )
206
+
207
+ # Example input tensor
208
+ test_input = torch.randn(1, 3, 10, 10, 10)
209
+
210
+ # Perform forward passes with both 3D and 2D settings
211
+ output_conv3d = dual_conv3d(test_input, use_conv3d=True)
212
+ output_2d = dual_conv3d(test_input, use_conv3d=False)
213
+
214
+ # Assert that the outputs from both methods are sufficiently close
215
+ assert torch.allclose(
216
+ output_conv3d, output_2d, atol=1e-6
217
+ ), "Outputs are not consistent between 3D and 2D convolutions."
flash_head/ltx_video/models/autoencoders/pixel_norm.py ADDED
@@ -0,0 +1,12 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class PixelNorm(nn.Module):
6
+ def __init__(self, dim=1, eps=1e-8):
7
+ super(PixelNorm, self).__init__()
8
+ self.dim = dim
9
+ self.eps = eps
10
+
11
+ def forward(self, x):
12
+ return x / torch.sqrt(torch.mean(x**2, dim=self.dim, keepdim=True) + self.eps)
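A quick sanity sketch (not part of the repository) showing what PixelNorm does: every (frame, y, x) position is rescaled so that its channel vector has approximately unit RMS.

import torch
from flash_head.ltx_video.models.autoencoders.pixel_norm import PixelNorm

norm = PixelNorm(dim=1)
x = torch.randn(2, 16, 4, 8, 8) * 3.0
y = norm(x)
rms = torch.sqrt(torch.mean(y**2, dim=1))   # per-position RMS over channels
print(rms.mean().item())                    # ~1.0, up to the eps term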
flash_head/ltx_video/models/autoencoders/vae.py ADDED
@@ -0,0 +1,380 @@
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
+ import inspect
5
+ import math
6
+ import torch.nn as nn
7
+ from diffusers import ConfigMixin, ModelMixin
8
+ from diffusers.models.autoencoders.vae import (
9
+ DecoderOutput,
10
+ DiagonalGaussianDistribution,
11
+ )
12
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
13
+ from flash_head.ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd
14
+
15
+
16
+ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
17
+ """Variational Autoencoder (VAE) model with KL loss.
18
+
19
+ VAE from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma and Max Welling.
20
+ This model is a wrapper around an encoder and a decoder, and it adds a KL loss term to the reconstruction loss.
21
+
22
+ Args:
23
+ encoder (`nn.Module`):
24
+ Encoder module.
25
+ decoder (`nn.Module`):
26
+ Decoder module.
27
+ latent_channels (`int`, *optional*, defaults to 4):
28
+ Number of latent channels.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ encoder: nn.Module,
34
+ decoder: nn.Module,
35
+ latent_channels: int = 4,
36
+ dims: int = 2,
37
+ sample_size=512,
38
+ use_quant_conv: bool = True,
39
+ normalize_latent_channels: bool = False,
40
+ ):
41
+ super().__init__()
42
+
43
+ # pass init params to Encoder
44
+ self.encoder = encoder
45
+ self.use_quant_conv = use_quant_conv
46
+ self.normalize_latent_channels = normalize_latent_channels
47
+
48
+ # pass init params to Decoder
49
+ quant_dims = 2 if dims == 2 else 3
50
+ self.decoder = decoder
51
+ if use_quant_conv:
52
+ self.quant_conv = make_conv_nd(
53
+ quant_dims, 2 * latent_channels, 2 * latent_channels, 1
54
+ )
55
+ self.post_quant_conv = make_conv_nd(
56
+ quant_dims, latent_channels, latent_channels, 1
57
+ )
58
+ else:
59
+ self.quant_conv = nn.Identity()
60
+ self.post_quant_conv = nn.Identity()
61
+
62
+ if normalize_latent_channels:
63
+ if dims == 2:
64
+ self.latent_norm_out = nn.BatchNorm2d(latent_channels, affine=False)
65
+ else:
66
+ self.latent_norm_out = nn.BatchNorm3d(latent_channels, affine=False)
67
+ else:
68
+ self.latent_norm_out = nn.Identity()
69
+ self.use_z_tiling = False
70
+ self.use_hw_tiling = False
71
+ self.dims = dims
72
+ self.z_sample_size = 1
73
+
74
+ self.decoder_params = inspect.signature(self.decoder.forward).parameters
75
+
76
+ # only relevant if vae tiling is enabled
77
+ self.set_tiling_params(sample_size=sample_size, overlap_factor=0.25)
78
+
79
+ def set_tiling_params(self, sample_size: int = 512, overlap_factor: float = 0.25):
80
+ self.tile_sample_min_size = sample_size
81
+ num_blocks = len(self.encoder.down_blocks)
82
+ self.tile_latent_min_size = int(sample_size / (2 ** (num_blocks - 1)))
83
+ self.tile_overlap_factor = overlap_factor
84
+
85
+ def enable_z_tiling(self, z_sample_size: int = 8):
86
+ r"""
87
+ Enable tiling during VAE decoding.
88
+
89
+ When this option is enabled, the VAE will split the input tensor in tiles to compute decoding in several
90
+ steps. This is useful to save some memory and allow larger batch sizes.
91
+ """
92
+ self.use_z_tiling = z_sample_size > 1
93
+ self.z_sample_size = z_sample_size
94
+ assert (
95
+ z_sample_size % 8 == 0 or z_sample_size == 1
96
+ ), f"z_sample_size must be a multiple of 8 or 1. Got {z_sample_size}."
97
+
98
+ def disable_z_tiling(self):
99
+ r"""
100
+ Disable tiling during VAE decoding. If `use_tiling` was previously invoked, this method will go back to computing
101
+ decoding in one step.
102
+ """
103
+ self.use_z_tiling = False
104
+
105
+ def enable_hw_tiling(self):
106
+ r"""
107
+ Enable tiling during VAE decoding along the height and width dimension.
108
+ """
109
+ self.use_hw_tiling = True
110
+
111
+ def disable_hw_tiling(self):
112
+ r"""
113
+ Disable tiling during VAE decoding along the height and width dimension.
114
+ """
115
+ self.use_hw_tiling = False
116
+
117
+ def _hw_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True):
118
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
119
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
120
+ row_limit = self.tile_latent_min_size - blend_extent
121
+
122
+ # Split the image into 512x512 tiles and encode them separately.
123
+ rows = []
124
+ for i in range(0, x.shape[3], overlap_size):
125
+ row = []
126
+ for j in range(0, x.shape[4], overlap_size):
127
+ tile = x[
128
+ :,
129
+ :,
130
+ :,
131
+ i : i + self.tile_sample_min_size,
132
+ j : j + self.tile_sample_min_size,
133
+ ]
134
+ tile = self.encoder(tile)
135
+ tile = self.quant_conv(tile)
136
+ row.append(tile)
137
+ rows.append(row)
138
+ result_rows = []
139
+ for i, row in enumerate(rows):
140
+ result_row = []
141
+ for j, tile in enumerate(row):
142
+ # blend the above tile and the left tile
143
+ # to the current tile and add the current tile to the result row
144
+ if i > 0:
145
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
146
+ if j > 0:
147
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
148
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
149
+ result_rows.append(torch.cat(result_row, dim=4))
150
+
151
+ moments = torch.cat(result_rows, dim=3)
152
+ return moments
153
+
154
+ def blend_z(
155
+ self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
156
+ ) -> torch.Tensor:
157
+ blend_extent = min(a.shape[2], b.shape[2], blend_extent)
158
+ for z in range(blend_extent):
159
+ b[:, :, z, :, :] = a[:, :, -blend_extent + z, :, :] * (
160
+ 1 - z / blend_extent
161
+ ) + b[:, :, z, :, :] * (z / blend_extent)
162
+ return b
163
+
164
+ def blend_v(
165
+ self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
166
+ ) -> torch.Tensor:
167
+ blend_extent = min(a.shape[3], b.shape[3], blend_extent)
168
+ for y in range(blend_extent):
169
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (
170
+ 1 - y / blend_extent
171
+ ) + b[:, :, :, y, :] * (y / blend_extent)
172
+ return b
173
+
174
+ def blend_h(
175
+ self, a: torch.Tensor, b: torch.Tensor, blend_extent: int
176
+ ) -> torch.Tensor:
177
+ blend_extent = min(a.shape[4], b.shape[4], blend_extent)
178
+ for x in range(blend_extent):
179
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (
180
+ 1 - x / blend_extent
181
+ ) + b[:, :, :, :, x] * (x / blend_extent)
182
+ return b
183
+
184
+ def _hw_tiled_decode(self, z: torch.FloatTensor, target_shape):
185
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
186
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
187
+ row_limit = self.tile_sample_min_size - blend_extent
188
+ tile_target_shape = (
189
+ *target_shape[:3],
190
+ self.tile_sample_min_size,
191
+ self.tile_sample_min_size,
192
+ )
193
+ # Split z into overlapping 64x64 tiles and decode them separately.
194
+ # The tiles have an overlap to avoid seams between tiles.
195
+ rows = []
196
+ for i in range(0, z.shape[3], overlap_size):
197
+ row = []
198
+ for j in range(0, z.shape[4], overlap_size):
199
+ tile = z[
200
+ :,
201
+ :,
202
+ :,
203
+ i : i + self.tile_latent_min_size,
204
+ j : j + self.tile_latent_min_size,
205
+ ]
206
+ tile = self.post_quant_conv(tile)
207
+ decoded = self.decoder(tile, target_shape=tile_target_shape)
208
+ row.append(decoded)
209
+ rows.append(row)
210
+ result_rows = []
211
+ for i, row in enumerate(rows):
212
+ result_row = []
213
+ for j, tile in enumerate(row):
214
+ # blend the above tile and the left tile
215
+ # to the current tile and add the current tile to the result row
216
+ if i > 0:
217
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
218
+ if j > 0:
219
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
220
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
221
+ result_rows.append(torch.cat(result_row, dim=4))
222
+
223
+ dec = torch.cat(result_rows, dim=3)
224
+ return dec
225
+
226
+ def encode(
227
+ self, z: torch.FloatTensor, return_dict: bool = True
228
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
229
+ if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
230
+ num_splits = z.shape[2] // self.z_sample_size
231
+ sizes = [self.z_sample_size] * num_splits
232
+ sizes = (
233
+ sizes + [z.shape[2] - sum(sizes)]
234
+ if z.shape[2] - sum(sizes) > 0
235
+ else sizes
236
+ )
237
+ tiles = z.split(sizes, dim=2)
238
+ moments_tiles = [
239
+ (
240
+ self._hw_tiled_encode(z_tile, return_dict)
241
+ if self.use_hw_tiling
242
+ else self._encode(z_tile)
243
+ )
244
+ for z_tile in tiles
245
+ ]
246
+ moments = torch.cat(moments_tiles, dim=2)
247
+
248
+ else:
249
+ moments = (
250
+ self._hw_tiled_encode(z, return_dict)
251
+ if self.use_hw_tiling
252
+ else self._encode(z)
253
+ )
254
+
255
+ posterior = DiagonalGaussianDistribution(moments)
256
+ if not return_dict:
257
+ return (posterior,)
258
+
259
+ return AutoencoderKLOutput(latent_dist=posterior)
260
+
261
+ def _normalize_latent_channels(self, z: torch.FloatTensor) -> torch.FloatTensor:
262
+ if isinstance(self.latent_norm_out, nn.BatchNorm3d):
263
+ _, c, _, _, _ = z.shape
264
+ z = torch.cat(
265
+ [
266
+ self.latent_norm_out(z[:, : c // 2, :, :, :]),
267
+ z[:, c // 2 :, :, :, :],
268
+ ],
269
+ dim=1,
270
+ )
271
+ elif isinstance(self.latent_norm_out, nn.BatchNorm2d):
272
+ raise NotImplementedError("BatchNorm2d not supported")
273
+ return z
274
+
275
+ def _unnormalize_latent_channels(self, z: torch.FloatTensor) -> torch.FloatTensor:
276
+ if isinstance(self.latent_norm_out, nn.BatchNorm3d):
277
+ running_mean = self.latent_norm_out.running_mean.view(1, -1, 1, 1, 1)
278
+ running_var = self.latent_norm_out.running_var.view(1, -1, 1, 1, 1)
279
+ eps = self.latent_norm_out.eps
280
+
281
+ z = z * torch.sqrt(running_var + eps) + running_mean
282
+ elif isinstance(self.latent_norm_out, nn.BatchNorm3d):
283
+ raise NotImplementedError("BatchNorm2d not supported")
284
+ return z
285
+
286
+ def _encode(self, x: torch.FloatTensor) -> AutoencoderKLOutput:
287
+ h = self.encoder(x)
288
+ moments = self.quant_conv(h)
289
+ moments = self._normalize_latent_channels(moments)
290
+ return moments
291
+
292
+ def _decode(
293
+ self,
294
+ z: torch.FloatTensor,
295
+ target_shape=None,
296
+ timestep: Optional[torch.Tensor] = None,
297
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
298
+ z = self._unnormalize_latent_channels(z)
299
+ z = self.post_quant_conv(z)
300
+ if "timestep" in self.decoder_params:
301
+ dec = self.decoder(z, target_shape=target_shape, timestep=timestep)
302
+ else:
303
+ dec = self.decoder(z, target_shape=target_shape)
304
+ return dec
305
+
306
+ def decode(
307
+ self,
308
+ z: torch.FloatTensor,
309
+ return_dict: bool = True,
310
+ target_shape=None,
311
+ timestep: Optional[torch.Tensor] = None,
312
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
313
+ assert target_shape is not None, "target_shape must be provided for decoding"
314
+ if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
315
+ reduction_factor = int(
316
+ self.encoder.patch_size_t
317
+ * 2
318
+ ** (
319
+ len(self.encoder.down_blocks)
320
+ - 1
321
+ - math.sqrt(self.encoder.patch_size)
322
+ )
323
+ )
324
+ split_size = self.z_sample_size // reduction_factor
325
+ num_splits = z.shape[2] // split_size
326
+
327
+ # copy target shape, and divide frame dimension (=2) by the context size
328
+ target_shape_split = list(target_shape)
329
+ target_shape_split[2] = target_shape[2] // num_splits
330
+
331
+ decoded_tiles = [
332
+ (
333
+ self._hw_tiled_decode(z_tile, target_shape_split)
334
+ if self.use_hw_tiling
335
+ else self._decode(z_tile, target_shape=target_shape_split)
336
+ )
337
+ for z_tile in torch.tensor_split(z, num_splits, dim=2)
338
+ ]
339
+ decoded = torch.cat(decoded_tiles, dim=2)
340
+ else:
341
+ decoded = (
342
+ self._hw_tiled_decode(z, target_shape)
343
+ if self.use_hw_tiling
344
+ else self._decode(z, target_shape=target_shape, timestep=timestep)
345
+ )
346
+
347
+ if not return_dict:
348
+ return (decoded,)
349
+
350
+ return DecoderOutput(sample=decoded)
351
+
352
+ def forward(
353
+ self,
354
+ sample: torch.FloatTensor,
355
+ sample_posterior: bool = False,
356
+ return_dict: bool = True,
357
+ generator: Optional[torch.Generator] = None,
358
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
359
+ r"""
360
+ Args:
361
+ sample (`torch.FloatTensor`): Input sample.
362
+ sample_posterior (`bool`, *optional*, defaults to `False`):
363
+ Whether to sample from the posterior.
364
+ return_dict (`bool`, *optional*, defaults to `True`):
365
+ Whether to return a [`DecoderOutput`] instead of a plain tuple.
366
+ generator (`torch.Generator`, *optional*):
367
+ Generator used to sample from the posterior.
368
+ """
369
+ x = sample
370
+ posterior = self.encode(x).latent_dist
371
+ if sample_posterior:
372
+ z = posterior.sample(generator=generator)
373
+ else:
374
+ z = posterior.mode()
375
+ dec = self.decode(z, target_shape=sample.shape).sample
376
+
377
+ if not return_dict:
378
+ return (dec,)
379
+
380
+ return DecoderOutput(sample=dec)
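A one-dimensional sketch (all sizes invented) of the linear cross-fade used by blend_h/blend_v above: over the overlap region the previous tile fades out while the current tile fades in, which hides the seam between decoded tiles.

import torch

blend_extent = 4
a = torch.ones(10)     # right edge of the previous tile
b = torch.zeros(10)    # left edge of the current tile
for x in range(blend_extent):
    b[x] = a[-blend_extent + x] * (1 - x / blend_extent) + b[x] * (x / blend_extent)
print(b[:blend_extent])   # 1.00, 0.75, 0.50, 0.25 -- a smooth hand-off across the overlap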
flash_head/ltx_video/models/autoencoders/vae_encode.py ADDED
@@ -0,0 +1,256 @@
1
+ from typing import Tuple
2
+ import torch
3
+ from diffusers import AutoencoderKL
4
+ from einops import rearrange
5
+ from torch import Tensor
6
+
7
+
8
+ from flash_head.ltx_video.models.autoencoders.causal_video_autoencoder import (
9
+ CausalVideoAutoencoder,
10
+ )
11
+ from flash_head.ltx_video.models.autoencoders.video_autoencoder import (
12
+ Downsample3D,
13
+ VideoAutoencoder,
14
+ )
15
+
16
+ try:
17
+ import torch_xla.core.xla_model as xm
18
+ except ImportError:
19
+ xm = None
20
+
21
+
22
+ def vae_encode(
23
+ media_items: Tensor,
24
+ vae: AutoencoderKL,
25
+ split_size: int = 1,
26
+ vae_per_channel_normalize=False,
27
+ ) -> Tensor:
28
+ """
29
+ Encodes media items (images or videos) into latent representations using a specified VAE model.
30
+ The function supports processing batches of images or video frames and can handle the processing
31
+ in smaller sub-batches if needed.
32
+
33
+ Args:
34
+ media_items (Tensor): A torch Tensor containing the media items to encode. The expected
35
+ shape is (batch_size, channels, height, width) for images or (batch_size, channels,
36
+ frames, height, width) for videos.
37
+ vae (AutoencoderKL): An instance of the `AutoencoderKL` class from the `diffusers` library,
38
+ pre-configured and loaded with the appropriate model weights.
39
+ split_size (int, optional): The number of sub-batches to split the input batch into for encoding.
40
+ If set to more than 1, the input media items are processed in smaller batches according to
41
+ this value. Defaults to 1, which processes all items in a single batch.
42
+
43
+ Returns:
44
+ Tensor: A torch Tensor of the encoded latent representations. The shape of the tensor is adjusted
45
+ to match the input shape, scaled by the model's configuration.
46
+
47
+ Examples:
48
+ >>> import torch
49
+ >>> from diffusers import AutoencoderKL
50
+ >>> vae = AutoencoderKL.from_pretrained('your-model-name')
51
+ >>> images = torch.rand(10, 3, 8, 256, 256) # Example tensor with 10 videos of 8 frames.
52
+ >>> latents = vae_encode(images, vae)
53
+ >>> print(latents.shape) # Output shape will depend on the model's latent configuration.
54
+
55
+ Note:
56
+ In the case of a video, the function encodes the media item frame by frame.
57
+ """
58
+ is_video_shaped = media_items.dim() == 5
59
+ batch_size, channels = media_items.shape[0:2]
60
+
61
+ if channels != 3:
62
+ raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
63
+
64
+ # if is_video_shaped and not isinstance(
65
+ # vae, (VideoAutoencoder, CausalVideoAutoencoder)
66
+ # ): # once the VAE is wrapped by FSDP the isinstance check can no longer identify it, so the condition below is dropped
67
+ if is_video_shaped and False: # forced to False for compatibility with FSDP-wrapped models
68
+ media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
69
+ if split_size > 1:
70
+ if len(media_items) % split_size != 0:
71
+ raise ValueError(
72
+ "Error: The batch size must be divisible by 'train.vae_bs_split'"
73
+ )
74
+ encode_bs = len(media_items) // split_size
75
+ # latents = [vae.encode(image_batch).latent_dist.sample() for image_batch in media_items.split(encode_bs)]
76
+ latents = []
77
+ if media_items.device.type == "xla":
78
+ xm.mark_step()
79
+ for image_batch in media_items.split(encode_bs):
80
+ latents.append(vae.encode(image_batch).latent_dist.sample())
81
+ if media_items.device.type == "xla":
82
+ xm.mark_step()
83
+ latents = torch.cat(latents, dim=0)
84
+ else:
85
+ latents = vae.encode(media_items).latent_dist.sample()
86
+
87
+ latents = normalize_latents(latents, vae, vae_per_channel_normalize)
88
+ # if is_video_shaped and not isinstance(
89
+ # vae, (VideoAutoencoder, CausalVideoAutoencoder)
90
+ # ):
91
+ if is_video_shaped and False: # forced to False for compatibility with FSDP-wrapped models
92
+ latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
93
+ return latents
94
+
95
+
96
+ def vae_decode(
97
+ latents: Tensor,
98
+ vae: AutoencoderKL,
99
+ is_video: bool = True,
100
+ split_size: int = 1,
101
+ vae_per_channel_normalize=False,
102
+ timestep=None,
103
+ ) -> Tensor:
104
+ is_video_shaped = latents.dim() == 5
105
+ batch_size = latents.shape[0]
106
+
107
+ # if is_video_shaped and not isinstance(
108
+ # vae, (VideoAutoencoder, CausalVideoAutoencoder)
109
+ # ):
110
+ if is_video_shaped and False: # forced to False for compatibility with FSDP-wrapped models
111
+ latents = rearrange(latents, "b c n h w -> (b n) c h w")
112
+ if split_size > 1:
113
+ if len(latents) % split_size != 0:
114
+ raise ValueError(
115
+ "Error: The batch size must be divisible by 'train.vae_bs_split'"
116
+ )
117
+ encode_bs = len(latents) // split_size
118
+ image_batch = [
119
+ _run_decoder(
120
+ latent_batch, vae, is_video, vae_per_channel_normalize, timestep
121
+ )
122
+ for latent_batch in latents.split(encode_bs)
123
+ ]
124
+ images = torch.cat(image_batch, dim=0)
125
+ else:
126
+ images = _run_decoder(
127
+ latents, vae, is_video, vae_per_channel_normalize, timestep
128
+ )
129
+
130
+ # if is_video_shaped and not isinstance(
131
+ # vae, (VideoAutoencoder, CausalVideoAutoencoder)
132
+ # ):
133
+ if is_video_shaped and False: # forced to False for compatibility with FSDP-wrapped models
134
+ images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
135
+ return images
136
+
137
+
138
+ def _run_decoder(
139
+ latents: Tensor,
140
+ vae: AutoencoderKL,
141
+ is_video: bool,
142
+ vae_per_channel_normalize=False,
143
+ timestep=None,
144
+ ) -> Tensor:
145
+ # if isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
146
+ if False: # hard-coded so the isinstance branch is skipped, for compatibility with FSDP-wrapped models
147
+ *_, fl, hl, wl = latents.shape
148
+ temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
149
+ latents = latents.to(vae.dtype)
150
+ vae_decode_kwargs = {}
151
+ if timestep is not None:
152
+ vae_decode_kwargs["timestep"] = timestep
153
+ image = vae.decode(
154
+ un_normalize_latents(latents, vae, vae_per_channel_normalize),
155
+ return_dict=False,
156
+ target_shape=(
157
+ 1,
158
+ 3,
159
+ fl * temporal_scale if is_video else 1,
160
+ hl * spatial_scale,
161
+ wl * spatial_scale,
162
+ ),
163
+ **vae_decode_kwargs,
164
+ )[0]
165
+ else:
166
+ image = vae.decode(
167
+ un_normalize_latents(latents, vae, vae_per_channel_normalize),
168
+ return_dict=False,
169
+ target_shape=latents.shape
170
+ )[0]
171
+
172
+ return image
173
+
174
+
175
+ def get_vae_size_scale_factor(vae: AutoencoderKL) -> Tuple[int, int, int]:
176
+ # if isinstance(vae, CausalVideoAutoencoder):
177
+ if True: # forced to True for compatibility with FSDP-wrapped models (replaces the isinstance check)
178
+ spatial = vae.spatial_downscale_factor
179
+ temporal = vae.temporal_downscale_factor
180
+ else:
181
+ down_blocks = len(
182
+ [
183
+ block
184
+ for block in vae.encoder.down_blocks
185
+ if isinstance(block.downsample, Downsample3D)
186
+ ]
187
+ )
188
+ spatial = vae.config.patch_size * 2**down_blocks
189
+ temporal = (
190
+ vae.config.patch_size_t * 2**down_blocks
191
+ if isinstance(vae, VideoAutoencoder)
192
+ else 1
193
+ )
194
+
195
+ return (temporal, spatial, spatial)
196
+
197
+
198
+ def latent_to_pixel_coords(
199
+ latent_coords: Tensor, vae: AutoencoderKL, causal_fix: bool = False
200
+ ) -> Tensor:
201
+ """
202
+ Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
203
+ configuration.
204
+
205
+ Args:
206
+ latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
207
+ containing the latent corner coordinates of each token.
208
+ vae (AutoencoderKL): The VAE model
209
+ causal_fix (bool): Whether to take into account the different temporal scale
210
+ of the first frame. Default = False for backwards compatibility.
211
+ Returns:
212
+ Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
213
+ """
214
+
215
+ scale_factors = get_vae_size_scale_factor(vae)
216
+ # causal_fix = isinstance(vae, CausalVideoAutoencoder) and causal_fix
217
+ causal_fix = True and causal_fix # True hard-coded for compatibility with FSDP-wrapped models
218
+ pixel_coords = latent_to_pixel_coords_from_factors(
219
+ latent_coords, scale_factors, causal_fix
220
+ )
221
+ return pixel_coords
222
+
223
+
224
+ def latent_to_pixel_coords_from_factors(
225
+ latent_coords: Tensor, scale_factors: Tuple, causal_fix: bool = False
226
+ ) -> Tensor:
227
+ pixel_coords = (
228
+ latent_coords
229
+ * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
230
+ )
231
+ if causal_fix:
232
+ # Fix temporal scale for first frame to 1 due to causality
233
+ pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
234
+ return pixel_coords
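A small worked example (scale factors invented) of the causal fix above: with a temporal scale of 8, latent frame indices 0, 1, 2 map to pixel frames 0, 1, 9, i.e. the first latent frame covers a single pixel frame while later ones each cover eight.

import torch
from flash_head.ltx_video.models.autoencoders.vae_encode import latent_to_pixel_coords_from_factors

scale_factors = (8, 32, 32)                                          # (temporal, height, width)
latent_coords = torch.tensor([[[0, 1, 2], [0, 0, 0], [0, 0, 0]]])    # (batch, 3, num_latents)
pix = latent_to_pixel_coords_from_factors(latent_coords, scale_factors, causal_fix=True)
print(pix[0, 0])   # tensor([0, 1, 9])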
235
+
236
+
237
+ def normalize_latents(
238
+ latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
239
+ ) -> Tensor:
240
+ return (
241
+ (latents - vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1))
242
+ / vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
243
+ if vae_per_channel_normalize
244
+ else latents * vae.config.scaling_factor
245
+ )
246
+
247
+
248
+ def un_normalize_latents(
249
+ latents: Tensor, vae: AutoencoderKL, vae_per_channel_normalize: bool = False
250
+ ) -> Tensor:
251
+ return (
252
+ latents * vae.std_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
253
+ + vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
254
+ if vae_per_channel_normalize
255
+ else latents / vae.config.scaling_factor
256
+ )
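A hedged round-trip sketch (dummy statistics, not the shipped ones) showing that normalize_latents and un_normalize_latents above are inverses when vae_per_channel_normalize=True: latents are standardized with the VAE's per-channel statistics and then restored.

import torch
from types import SimpleNamespace
from flash_head.ltx_video.models.autoencoders.vae_encode import normalize_latents, un_normalize_latents

vae = SimpleNamespace(
    mean_of_means=torch.randn(8),      # illustrative per-channel statistics
    std_of_means=torch.rand(8) + 0.5,
)
latents = torch.randn(1, 8, 2, 4, 4)
normed = normalize_latents(latents, vae, vae_per_channel_normalize=True)
restored = un_normalize_latents(normed, vae, vae_per_channel_normalize=True)
print(torch.allclose(latents, restored, atol=1e-6))   # True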
flash_head/ltx_video/models/autoencoders/video_autoencoder.py ADDED
@@ -0,0 +1,1045 @@
1
+ import json
2
+ import os
3
+ from functools import partial
4
+ from types import SimpleNamespace
5
+ from typing import Any, Mapping, Optional, Tuple, Union
6
+
7
+ import torch
8
+ from einops import rearrange
9
+ from torch import nn
10
+ from torch.nn import functional
11
+
12
+ from diffusers.utils import logging
13
+
14
+ from flash_head.ltx_video.utils.torch_utils import Identity
15
+ from flash_head.ltx_video.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
16
+ from flash_head.ltx_video.models.autoencoders.pixel_norm import PixelNorm
17
+ from flash_head.ltx_video.models.autoencoders.vae import AutoencoderKLWrapper
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class VideoAutoencoder(AutoencoderKLWrapper):
23
+ @classmethod
24
+ def from_pretrained(
25
+ cls,
26
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
27
+ *args,
28
+ **kwargs,
29
+ ):
30
+ config_local_path = pretrained_model_name_or_path / "config.json"
31
+ config = cls.load_config(config_local_path, **kwargs)
32
+ video_vae = cls.from_config(config)
33
+ video_vae.to(kwargs["torch_dtype"])
34
+
35
+ model_local_path = pretrained_model_name_or_path / "autoencoder.pth"
36
+ ckpt_state_dict = torch.load(model_local_path)
37
+ video_vae.load_state_dict(ckpt_state_dict)
38
+
39
+ statistics_local_path = (
40
+ pretrained_model_name_or_path / "per_channel_statistics.json"
41
+ )
42
+ if statistics_local_path.exists():
43
+ with open(statistics_local_path, "r") as file:
44
+ data = json.load(file)
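+ # The statistics file is assumed to be a pandas-style "split" dump with
+ # "columns" and "data" keys, e.g. {"columns": ["std-of-means", ...], "data": [[...], ...]}.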
45
+ transposed_data = list(zip(*data["data"]))
46
+ data_dict = {
47
+ col: torch.tensor(vals)
48
+ for col, vals in zip(data["columns"], transposed_data)
49
+ }
50
+ video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
51
+ video_vae.register_buffer(
52
+ "mean_of_means",
53
+ data_dict.get(
54
+ "mean-of-means", torch.zeros_like(data_dict["std-of-means"])
55
+ ),
56
+ )
57
+
58
+ return video_vae
59
+
60
+ @staticmethod
61
+ def from_config(config):
62
+ assert (
63
+ config["_class_name"] == "VideoAutoencoder"
64
+ ), "config must have _class_name=VideoAutoencoder"
65
+ if isinstance(config["dims"], list):
66
+ config["dims"] = tuple(config["dims"])
67
+
68
+ assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
69
+
70
+ double_z = config.get("double_z", True)
71
+ latent_log_var = config.get(
72
+ "latent_log_var", "per_channel" if double_z else "none"
73
+ )
74
+ use_quant_conv = config.get("use_quant_conv", True)
75
+
76
+ if use_quant_conv and latent_log_var == "uniform":
77
+ raise ValueError("uniform latent_log_var requires use_quant_conv=False")
78
+
79
+ encoder = Encoder(
80
+ dims=config["dims"],
81
+ in_channels=config.get("in_channels", 3),
82
+ out_channels=config["latent_channels"],
83
+ block_out_channels=config["block_out_channels"],
84
+ patch_size=config.get("patch_size", 1),
85
+ latent_log_var=latent_log_var,
86
+ norm_layer=config.get("norm_layer", "group_norm"),
87
+ patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
88
+ add_channel_padding=config.get("add_channel_padding", False),
89
+ )
90
+
91
+ decoder = Decoder(
92
+ dims=config["dims"],
93
+ in_channels=config["latent_channels"],
94
+ out_channels=config.get("out_channels", 3),
95
+ block_out_channels=config["block_out_channels"],
96
+ patch_size=config.get("patch_size", 1),
97
+ norm_layer=config.get("norm_layer", "group_norm"),
98
+ patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
99
+ add_channel_padding=config.get("add_channel_padding", False),
100
+ )
101
+
102
+ dims = config["dims"]
103
+ return VideoAutoencoder(
104
+ encoder=encoder,
105
+ decoder=decoder,
106
+ latent_channels=config["latent_channels"],
107
+ dims=dims,
108
+ use_quant_conv=use_quant_conv,
109
+ )
110
+
111
+ @property
112
+ def config(self):
113
+ return SimpleNamespace(
114
+ _class_name="VideoAutoencoder",
115
+ dims=self.dims,
116
+ in_channels=self.encoder.conv_in.in_channels
117
+ // (self.encoder.patch_size_t * self.encoder.patch_size**2),
118
+ out_channels=self.decoder.conv_out.out_channels
119
+ // (self.decoder.patch_size_t * self.decoder.patch_size**2),
120
+ latent_channels=self.decoder.conv_in.in_channels,
121
+ block_out_channels=[
122
+ self.encoder.down_blocks[i].res_blocks[-1].conv1.out_channels
123
+ for i in range(len(self.encoder.down_blocks))
124
+ ],
125
+ scaling_factor=1.0,
126
+ norm_layer=self.encoder.norm_layer,
127
+ patch_size=self.encoder.patch_size,
128
+ latent_log_var=self.encoder.latent_log_var,
129
+ use_quant_conv=self.use_quant_conv,
130
+ patch_size_t=self.encoder.patch_size_t,
131
+ add_channel_padding=self.encoder.add_channel_padding,
132
+ )
133
+
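+ # Note: scaling_factor is fixed at 1.0 here, so the plain scaling path in
+ # normalize_latents / un_normalize_latents is effectively an identity for this model;
+ # only the per-channel statistics path rescales the latents.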
134
+ @property
135
+ def is_video_supported(self):
136
+ """
137
+ Check if the model supports video inputs of shape (B, C, F, H, W). Otherwise, the model only supports 2D images.
138
+ """
139
+ return self.dims != 2
140
+
141
+ @property
142
+ def downscale_factor(self):
143
+ return self.encoder.downsample_factor
144
+
145
+ def to_json_string(self) -> str:
146
+ import json
147
+
148
+ return json.dumps(self.config.__dict__)
149
+
150
+ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
151
+ model_keys = set(name for name, _ in self.named_parameters())
152
+
153
+ key_mapping = {
154
+ ".resnets.": ".res_blocks.",
155
+ "downsamplers.0": "downsample",
156
+ "upsamplers.0": "upsample",
157
+ }
158
+
159
+ converted_state_dict = {}
160
+ for key, value in state_dict.items():
161
+ for k, v in key_mapping.items():
162
+ key = key.replace(k, v)
163
+
164
+ if "norm" in key and key not in model_keys:
165
+ logger.info(
166
+ f"Removing key {key} from state_dict as it is not present in the model"
167
+ )
168
+ continue
169
+
170
+ converted_state_dict[key] = value
171
+
172
+ super().load_state_dict(converted_state_dict, strict=strict)
173
+
174
+ def last_layer(self):
175
+ if hasattr(self.decoder, "conv_out"):
176
+ if isinstance(self.decoder.conv_out, nn.Sequential):
177
+ last_layer = self.decoder.conv_out[-1]
178
+ else:
179
+ last_layer = self.decoder.conv_out
180
+ else:
181
+ last_layer = self.decoder.layers[-1]
182
+ return last_layer
183
+
184
+
185
+ class Encoder(nn.Module):
186
+ r"""
187
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
188
+
189
+ Args:
190
+ in_channels (`int`, *optional*, defaults to 3):
191
+ The number of input channels.
192
+ out_channels (`int`, *optional*, defaults to 3):
193
+ The number of output channels.
194
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
195
+ The number of output channels for each block.
196
+ layers_per_block (`int`, *optional*, defaults to 2):
197
+ The number of layers per block.
198
+ norm_num_groups (`int`, *optional*, defaults to 32):
199
+ The number of groups for normalization.
200
+ patch_size (`int`, *optional*, defaults to 1):
201
+ The patch size to use. Should be a power of 2.
202
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
203
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
204
+ latent_log_var (`str`, *optional*, defaults to `per_channel`):
205
+ How the latent log-variance is parameterized. Can be either `per_channel`, `uniform`, or `none`.
206
+ """
207
+
208
+ def __init__(
209
+ self,
210
+ dims: Union[int, Tuple[int, int]] = 3,
211
+ in_channels: int = 3,
212
+ out_channels: int = 3,
213
+ block_out_channels: Tuple[int, ...] = (64,),
214
+ layers_per_block: int = 2,
215
+ norm_num_groups: int = 32,
216
+ patch_size: Union[int, Tuple[int]] = 1,
217
+ norm_layer: str = "group_norm", # group_norm, pixel_norm
218
+ latent_log_var: str = "per_channel",
219
+ patch_size_t: Optional[int] = None,
220
+ add_channel_padding: Optional[bool] = False,
221
+ ):
222
+ super().__init__()
223
+ self.patch_size = patch_size
224
+ self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
225
+ self.add_channel_padding = add_channel_padding
226
+ self.layers_per_block = layers_per_block
227
+ self.norm_layer = norm_layer
228
+ self.latent_channels = out_channels
229
+ self.latent_log_var = latent_log_var
230
+ if add_channel_padding:
231
+ in_channels = in_channels * self.patch_size**3
232
+ else:
233
+ in_channels = in_channels * self.patch_size_t * self.patch_size**2
234
+ self.in_channels = in_channels
235
+ output_channel = block_out_channels[0]
236
+
237
+ self.conv_in = make_conv_nd(
238
+ dims=dims,
239
+ in_channels=in_channels,
240
+ out_channels=output_channel,
241
+ kernel_size=3,
242
+ stride=1,
243
+ padding=1,
244
+ )
245
+
246
+ self.down_blocks = nn.ModuleList([])
247
+
248
+ for i in range(len(block_out_channels)):
249
+ input_channel = output_channel
250
+ output_channel = block_out_channels[i]
251
+ is_final_block = i == len(block_out_channels) - 1
252
+
253
+ down_block = DownEncoderBlock3D(
254
+ dims=dims,
255
+ in_channels=input_channel,
256
+ out_channels=output_channel,
257
+ num_layers=self.layers_per_block,
258
+ add_downsample=not is_final_block and 2**i >= patch_size,
259
+ resnet_eps=1e-6,
260
+ downsample_padding=0,
261
+ resnet_groups=norm_num_groups,
262
+ norm_layer=norm_layer,
263
+ )
264
+ self.down_blocks.append(down_block)
265
+
266
+ self.mid_block = UNetMidBlock3D(
267
+ dims=dims,
268
+ in_channels=block_out_channels[-1],
269
+ num_layers=self.layers_per_block,
270
+ resnet_eps=1e-6,
271
+ resnet_groups=norm_num_groups,
272
+ norm_layer=norm_layer,
273
+ )
274
+
275
+ # out
276
+ if norm_layer == "group_norm":
277
+ self.conv_norm_out = nn.GroupNorm(
278
+ num_channels=block_out_channels[-1],
279
+ num_groups=norm_num_groups,
280
+ eps=1e-6,
281
+ )
282
+ elif norm_layer == "pixel_norm":
283
+ self.conv_norm_out = PixelNorm()
284
+ self.conv_act = nn.SiLU()
285
+
286
+ conv_out_channels = out_channels
287
+ if latent_log_var == "per_channel":
288
+ conv_out_channels *= 2
289
+ elif latent_log_var == "uniform":
290
+ conv_out_channels += 1
291
+ elif latent_log_var != "none":
292
+ raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
293
+ self.conv_out = make_conv_nd(
294
+ dims, block_out_channels[-1], conv_out_channels, 3, padding=1
295
+ )
296
+
297
+ self.gradient_checkpointing = False
298
+
299
+ @property
300
+ def downscale_factor(self):
301
+ return (
302
+ 2
303
+ ** len(
304
+ [
305
+ block
306
+ for block in self.down_blocks
307
+ if isinstance(block.downsample, Downsample3D)
308
+ ]
309
+ )
310
+ * self.patch_size
311
+ )
312
+
313
+ def forward(
314
+ self, sample: torch.FloatTensor, return_features=False
315
+ ) -> torch.FloatTensor:
316
+ r"""The forward method of the `Encoder` class."""
317
+
318
+ downsample_in_time = sample.shape[2] != 1
319
+
320
+ # patchify
321
+ patch_size_t = self.patch_size_t if downsample_in_time else 1
322
+ sample = patchify(
323
+ sample,
324
+ patch_size_hw=self.patch_size,
325
+ patch_size_t=patch_size_t,
326
+ add_channel_padding=self.add_channel_padding,
327
+ )
328
+
329
+ sample = self.conv_in(sample)
330
+
331
+ checkpoint_fn = (
332
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
333
+ if self.gradient_checkpointing and self.training
334
+ else lambda x: x
335
+ )
336
+
337
+ if return_features:
338
+ features = []
339
+ for down_block in self.down_blocks:
340
+ sample = checkpoint_fn(down_block)(
341
+ sample, downsample_in_time=downsample_in_time
342
+ )
343
+ if return_features:
344
+ features.append(sample)
345
+
346
+ sample = checkpoint_fn(self.mid_block)(sample)
347
+
348
+ # post-process
349
+ sample = self.conv_norm_out(sample)
350
+ sample = self.conv_act(sample)
351
+ sample = self.conv_out(sample)
352
+
353
+ if self.latent_log_var == "uniform":
354
+ last_channel = sample[:, -1:, ...]
355
+ num_dims = sample.dim()
356
+
357
+ if num_dims == 4:
358
+ # For shape (B, C, H, W)
359
+ repeated_last_channel = last_channel.repeat(
360
+ 1, sample.shape[1] - 2, 1, 1
361
+ )
362
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
363
+ elif num_dims == 5:
364
+ # For shape (B, C, F, H, W)
365
+ repeated_last_channel = last_channel.repeat(
366
+ 1, sample.shape[1] - 2, 1, 1, 1
367
+ )
368
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
369
+ else:
370
+ raise ValueError(f"Invalid input shape: {sample.shape}")
371
+
372
+ if return_features:
373
+ features.append(sample[:, : self.latent_channels, ...])
374
+ return sample, features
375
+ return sample
376
+
377
+
378
+ class Decoder(nn.Module):
379
+ r"""
380
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
381
+
382
+ Args:
383
+ in_channels (`int`, *optional*, defaults to 3):
384
+ The number of input channels.
385
+ out_channels (`int`, *optional*, defaults to 3):
386
+ The number of output channels.
387
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
388
+ The number of output channels for each block.
389
+ layers_per_block (`int`, *optional*, defaults to 2):
390
+ The number of layers per block.
391
+ norm_num_groups (`int`, *optional*, defaults to 32):
392
+ The number of groups for normalization.
393
+ patch_size (`int`, *optional*, defaults to 1):
394
+ The patch size to use. Should be a power of 2.
395
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
396
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
397
+ """
398
+
399
+ def __init__(
400
+ self,
401
+ dims,
402
+ in_channels: int = 3,
403
+ out_channels: int = 3,
404
+ block_out_channels: Tuple[int, ...] = (64,),
405
+ layers_per_block: int = 2,
406
+ norm_num_groups: int = 32,
407
+ patch_size: int = 1,
408
+ norm_layer: str = "group_norm",
409
+ patch_size_t: Optional[int] = None,
410
+ add_channel_padding: Optional[bool] = False,
411
+ ):
412
+ super().__init__()
413
+ self.patch_size = patch_size
414
+ self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
415
+ self.add_channel_padding = add_channel_padding
416
+ self.layers_per_block = layers_per_block
417
+ if add_channel_padding:
418
+ out_channels = out_channels * self.patch_size**3
419
+ else:
420
+ out_channels = out_channels * self.patch_size_t * self.patch_size**2
421
+ self.out_channels = out_channels
422
+
423
+ self.conv_in = make_conv_nd(
424
+ dims,
425
+ in_channels,
426
+ block_out_channels[-1],
427
+ kernel_size=3,
428
+ stride=1,
429
+ padding=1,
430
+ )
431
+
432
+ self.mid_block = None
433
+ self.up_blocks = nn.ModuleList([])
434
+
435
+ self.mid_block = UNetMidBlock3D(
436
+ dims=dims,
437
+ in_channels=block_out_channels[-1],
438
+ num_layers=self.layers_per_block,
439
+ resnet_eps=1e-6,
440
+ resnet_groups=norm_num_groups,
441
+ norm_layer=norm_layer,
442
+ )
443
+
444
+ reversed_block_out_channels = list(reversed(block_out_channels))
445
+ output_channel = reversed_block_out_channels[0]
446
+ for i in range(len(reversed_block_out_channels)):
447
+ prev_output_channel = output_channel
448
+ output_channel = reversed_block_out_channels[i]
449
+
450
+ is_final_block = i == len(block_out_channels) - 1
451
+
452
+ up_block = UpDecoderBlock3D(
453
+ dims=dims,
454
+ num_layers=self.layers_per_block + 1,
455
+ in_channels=prev_output_channel,
456
+ out_channels=output_channel,
457
+ add_upsample=not is_final_block
458
+ and 2 ** (len(block_out_channels) - i - 1) > patch_size,
459
+ resnet_eps=1e-6,
460
+ resnet_groups=norm_num_groups,
461
+ norm_layer=norm_layer,
462
+ )
463
+ self.up_blocks.append(up_block)
464
+
465
+ if norm_layer == "group_norm":
466
+ self.conv_norm_out = nn.GroupNorm(
467
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6
468
+ )
469
+ elif norm_layer == "pixel_norm":
470
+ self.conv_norm_out = PixelNorm()
471
+
472
+ self.conv_act = nn.SiLU()
473
+ self.conv_out = make_conv_nd(
474
+ dims, block_out_channels[0], out_channels, 3, padding=1
475
+ )
476
+
477
+ self.gradient_checkpointing = False
478
+
479
+ def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
480
+ r"""The forward method of the `Decoder` class."""
481
+ assert target_shape is not None, "target_shape must be provided"
482
+ upsample_in_time = sample.shape[2] < target_shape[2]
483
+
484
+ sample = self.conv_in(sample)
485
+
486
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
487
+
488
+ checkpoint_fn = (
489
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
490
+ if self.gradient_checkpointing and self.training
491
+ else lambda x: x
492
+ )
493
+
494
+ sample = checkpoint_fn(self.mid_block)(sample)
495
+ sample = sample.to(upscale_dtype)
496
+
497
+ for up_block in self.up_blocks:
498
+ sample = checkpoint_fn(up_block)(sample, upsample_in_time=upsample_in_time)
499
+
500
+ # post-process
501
+ sample = self.conv_norm_out(sample)
502
+ sample = self.conv_act(sample)
503
+ sample = self.conv_out(sample)
504
+
505
+ # un-patchify
506
+ patch_size_t = self.patch_size_t if upsample_in_time else 1
507
+ sample = unpatchify(
508
+ sample,
509
+ patch_size_hw=self.patch_size,
510
+ patch_size_t=patch_size_t,
511
+ add_channel_padding=self.add_channel_padding,
512
+ )
513
+
514
+ return sample
515
+
516
+
517
+ class DownEncoderBlock3D(nn.Module):
518
+ def __init__(
519
+ self,
520
+ dims: Union[int, Tuple[int, int]],
521
+ in_channels: int,
522
+ out_channels: int,
523
+ dropout: float = 0.0,
524
+ num_layers: int = 1,
525
+ resnet_eps: float = 1e-6,
526
+ resnet_groups: int = 32,
527
+ add_downsample: bool = True,
528
+ downsample_padding: int = 1,
529
+ norm_layer: str = "group_norm",
530
+ ):
531
+ super().__init__()
532
+ res_blocks = []
533
+
534
+ for i in range(num_layers):
535
+ in_channels = in_channels if i == 0 else out_channels
536
+ res_blocks.append(
537
+ ResnetBlock3D(
538
+ dims=dims,
539
+ in_channels=in_channels,
540
+ out_channels=out_channels,
541
+ eps=resnet_eps,
542
+ groups=resnet_groups,
543
+ dropout=dropout,
544
+ norm_layer=norm_layer,
545
+ )
546
+ )
547
+
548
+ self.res_blocks = nn.ModuleList(res_blocks)
549
+
550
+ if add_downsample:
551
+ self.downsample = Downsample3D(
552
+ dims,
553
+ out_channels,
554
+ out_channels=out_channels,
555
+ padding=downsample_padding,
556
+ )
557
+ else:
558
+ self.downsample = Identity()
559
+
560
+ def forward(
561
+ self, hidden_states: torch.FloatTensor, downsample_in_time
562
+ ) -> torch.FloatTensor:
563
+ for resnet in self.res_blocks:
564
+ hidden_states = resnet(hidden_states)
565
+
566
+ hidden_states = self.downsample(
567
+ hidden_states, downsample_in_time=downsample_in_time
568
+ )
569
+
570
+ return hidden_states
571
+
572
+
573
+ class UNetMidBlock3D(nn.Module):
574
+ """
575
+ A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
576
+
577
+ Args:
578
+ in_channels (`int`): The number of input channels.
579
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
580
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
581
+ resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
582
+ resnet_groups (`int`, *optional*, defaults to 32):
583
+ The number of groups to use in the group normalization layers of the resnet blocks.
584
+
585
+ Returns:
586
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
587
+ in_channels, height, width)`.
588
+
589
+ """
590
+
591
+ def __init__(
592
+ self,
593
+ dims: Union[int, Tuple[int, int]],
594
+ in_channels: int,
595
+ dropout: float = 0.0,
596
+ num_layers: int = 1,
597
+ resnet_eps: float = 1e-6,
598
+ resnet_groups: int = 32,
599
+ norm_layer: str = "group_norm",
600
+ ):
601
+ super().__init__()
602
+ resnet_groups = (
603
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
604
+ )
605
+
606
+ self.res_blocks = nn.ModuleList(
607
+ [
608
+ ResnetBlock3D(
609
+ dims=dims,
610
+ in_channels=in_channels,
611
+ out_channels=in_channels,
612
+ eps=resnet_eps,
613
+ groups=resnet_groups,
614
+ dropout=dropout,
615
+ norm_layer=norm_layer,
616
+ )
617
+ for _ in range(num_layers)
618
+ ]
619
+ )
620
+
621
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
622
+ for resnet in self.res_blocks:
623
+ hidden_states = resnet(hidden_states)
624
+
625
+ return hidden_states
626
+
627
+
628
+ class UpDecoderBlock3D(nn.Module):
629
+ def __init__(
630
+ self,
631
+ dims: Union[int, Tuple[int, int]],
632
+ in_channels: int,
633
+ out_channels: int,
634
+ resolution_idx: Optional[int] = None,
635
+ dropout: float = 0.0,
636
+ num_layers: int = 1,
637
+ resnet_eps: float = 1e-6,
638
+ resnet_groups: int = 32,
639
+ add_upsample: bool = True,
640
+ norm_layer: str = "group_norm",
641
+ ):
642
+ super().__init__()
643
+ res_blocks = []
644
+
645
+ for i in range(num_layers):
646
+ input_channels = in_channels if i == 0 else out_channels
647
+
648
+ res_blocks.append(
649
+ ResnetBlock3D(
650
+ dims=dims,
651
+ in_channels=input_channels,
652
+ out_channels=out_channels,
653
+ eps=resnet_eps,
654
+ groups=resnet_groups,
655
+ dropout=dropout,
656
+ norm_layer=norm_layer,
657
+ )
658
+ )
659
+
660
+ self.res_blocks = nn.ModuleList(res_blocks)
661
+
662
+ if add_upsample:
663
+ self.upsample = Upsample3D(
664
+ dims=dims, channels=out_channels, out_channels=out_channels
665
+ )
666
+ else:
667
+ self.upsample = Identity()
668
+
669
+ self.resolution_idx = resolution_idx
670
+
671
+ def forward(
672
+ self, hidden_states: torch.FloatTensor, upsample_in_time=True
673
+ ) -> torch.FloatTensor:
674
+ for resnet in self.res_blocks:
675
+ hidden_states = resnet(hidden_states)
676
+
677
+ hidden_states = self.upsample(hidden_states, upsample_in_time=upsample_in_time)
678
+
679
+ return hidden_states
680
+
681
+
682
+ class ResnetBlock3D(nn.Module):
683
+ r"""
684
+ A Resnet block.
685
+
686
+ Parameters:
687
+ in_channels (`int`): The number of channels in the input.
688
+ out_channels (`int`, *optional*, default to be `None`):
689
+ The number of output channels for the first conv layer. If None, same as `in_channels`.
690
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
691
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
692
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
693
+ """
694
+
695
+ def __init__(
696
+ self,
697
+ dims: Union[int, Tuple[int, int]],
698
+ in_channels: int,
699
+ out_channels: Optional[int] = None,
700
+ conv_shortcut: bool = False,
701
+ dropout: float = 0.0,
702
+ groups: int = 32,
703
+ eps: float = 1e-6,
704
+ norm_layer: str = "group_norm",
705
+ ):
706
+ super().__init__()
707
+ self.in_channels = in_channels
708
+ out_channels = in_channels if out_channels is None else out_channels
709
+ self.out_channels = out_channels
710
+ self.use_conv_shortcut = conv_shortcut
711
+
712
+ if norm_layer == "group_norm":
713
+ self.norm1 = torch.nn.GroupNorm(
714
+ num_groups=groups, num_channels=in_channels, eps=eps, affine=True
715
+ )
716
+ elif norm_layer == "pixel_norm":
717
+ self.norm1 = PixelNorm()
718
+
719
+ self.non_linearity = nn.SiLU()
720
+
721
+ self.conv1 = make_conv_nd(
722
+ dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1
723
+ )
724
+
725
+ if norm_layer == "group_norm":
726
+ self.norm2 = torch.nn.GroupNorm(
727
+ num_groups=groups, num_channels=out_channels, eps=eps, affine=True
728
+ )
729
+ elif norm_layer == "pixel_norm":
730
+ self.norm2 = PixelNorm()
731
+
732
+ self.dropout = torch.nn.Dropout(dropout)
733
+
734
+ self.conv2 = make_conv_nd(
735
+ dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1
736
+ )
737
+
738
+ self.conv_shortcut = (
739
+ make_linear_nd(
740
+ dims=dims, in_channels=in_channels, out_channels=out_channels
741
+ )
742
+ if in_channels != out_channels
743
+ else nn.Identity()
744
+ )
745
+
746
+ def forward(
747
+ self,
748
+ input_tensor: torch.FloatTensor,
749
+ ) -> torch.FloatTensor:
750
+ hidden_states = input_tensor
751
+
752
+ hidden_states = self.norm1(hidden_states)
753
+
754
+ hidden_states = self.non_linearity(hidden_states)
755
+
756
+ hidden_states = self.conv1(hidden_states)
757
+
758
+ hidden_states = self.norm2(hidden_states)
759
+
760
+ hidden_states = self.non_linearity(hidden_states)
761
+
762
+ hidden_states = self.dropout(hidden_states)
763
+
764
+ hidden_states = self.conv2(hidden_states)
765
+
766
+ input_tensor = self.conv_shortcut(input_tensor)
767
+
768
+ output_tensor = input_tensor + hidden_states
769
+
770
+ return output_tensor
771
+
772
+
773
+ class Downsample3D(nn.Module):
774
+ def __init__(
775
+ self,
776
+ dims,
777
+ in_channels: int,
778
+ out_channels: int,
779
+ kernel_size: int = 3,
780
+ padding: int = 1,
781
+ ):
782
+ super().__init__()
783
+ stride: int = 2
784
+ self.padding = padding
785
+ self.in_channels = in_channels
786
+ self.dims = dims
787
+ self.conv = make_conv_nd(
788
+ dims=dims,
789
+ in_channels=in_channels,
790
+ out_channels=out_channels,
791
+ kernel_size=kernel_size,
792
+ stride=stride,
793
+ padding=padding,
794
+ )
795
+
796
+ def forward(self, x, downsample_in_time=True):
797
+ conv = self.conv
798
+ if self.padding == 0:
799
+ if self.dims == 2:
800
+ padding = (0, 1, 0, 1)
801
+ else:
802
+ padding = (0, 1, 0, 1, 0, 1 if downsample_in_time else 0)
803
+
804
+ x = functional.pad(x, padding, mode="constant", value=0)
805
+
806
+ if self.dims == (2, 1) and not downsample_in_time:
807
+ return conv(x, skip_time_conv=True)
808
+
809
+ return conv(x)
810
+
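+ # Note: when constructed with padding=0, the forward pass pads only on the right,
+ # bottom and (optionally) temporal end before the stride-2 convolution, so each
+ # padded dimension is halved without an explicit "same" padding mode.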
811
+
812
+ class Upsample3D(nn.Module):
813
+ """
814
+ An upsampling layer for 3D tensors of shape (B, C, D, H, W).
815
+
816
+ :param channels: channels in the inputs and outputs.
817
+ """
818
+
819
+ def __init__(self, dims, channels, out_channels=None):
820
+ super().__init__()
821
+ self.dims = dims
822
+ self.channels = channels
823
+ self.out_channels = out_channels or channels
824
+ self.conv = make_conv_nd(
825
+ dims, channels, self.out_channels, kernel_size=3, padding=1, bias=True
826
+ )
827
+
828
+ def forward(self, x, upsample_in_time):
829
+ if self.dims == 2:
830
+ x = functional.interpolate(
831
+ x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
832
+ )
833
+ else:
834
+ time_scale_factor = 2 if upsample_in_time else 1
835
+ # print("before:", x.shape)
836
+ b, c, d, h, w = x.shape
837
+ x = rearrange(x, "b c d h w -> (b d) c h w")
838
+ # height and width interpolate
839
+ x = functional.interpolate(
840
+ x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest"
841
+ )
842
+ _, _, h, w = x.shape
843
+
844
+ if not upsample_in_time and self.dims == (2, 1):
845
+ x = rearrange(x, "(b d) c h w -> b c d h w ", b=b, h=h, w=w)
846
+ return self.conv(x, skip_time_conv=True)
847
+
848
+ # Second, temporal upsampling, which is essentially treated as a 1D operation across the 'd' dimension
849
+ x = rearrange(x, "(b d) c h w -> (b h w) c 1 d", b=b)
850
+
851
+ # (b h w) c 1 d
852
+ new_d = x.shape[-1] * time_scale_factor
853
+ x = functional.interpolate(x, (1, new_d), mode="nearest")
854
+ # (b h w) c 1 new_d
855
+ x = rearrange(
856
+ x, "(b h w) c 1 new_d -> b c new_d h w", b=b, h=h, w=w, new_d=new_d
857
+ )
858
+ # b c d h w
859
+
860
+ # x = functional.interpolate(
861
+ # x, (x.shape[2] * time_scale_factor, x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
862
+ # )
863
+ # print("after:", x.shape)
864
+
865
+ return self.conv(x)
866
+
867
+
868
+ def patchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
869
+ if patch_size_hw == 1 and patch_size_t == 1:
870
+ return x
871
+ if x.dim() == 4:
872
+ x = rearrange(
873
+ x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw
874
+ )
875
+ elif x.dim() == 5:
876
+ x = rearrange(
877
+ x,
878
+ "b c (f p) (h q) (w r) -> b (c p r q) f h w",
879
+ p=patch_size_t,
880
+ q=patch_size_hw,
881
+ r=patch_size_hw,
882
+ )
883
+ else:
884
+ raise ValueError(f"Invalid input shape: {x.shape}")
885
+
886
+ if (
887
+ (x.dim() == 5)
888
+ and (patch_size_hw > patch_size_t)
889
+ and (patch_size_t > 1 or add_channel_padding)
890
+ ):
891
+ channels_to_pad = x.shape[1] * (patch_size_hw // patch_size_t) - x.shape[1]
892
+ padding_zeros = torch.zeros(
893
+ x.shape[0],
894
+ channels_to_pad,
895
+ x.shape[2],
896
+ x.shape[3],
897
+ x.shape[4],
898
+ device=x.device,
899
+ dtype=x.dtype,
900
+ )
901
+ x = torch.cat([padding_zeros, x], dim=1)
902
+
903
+ return x
904
+
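+ # Shape example (illustrative only): patchify on a (B, C, F, H, W) = (2, 3, 8, 64, 64)
+ # tensor with patch_size_hw=4 and patch_size_t=4 folds the patches into channels and
+ # yields (2, 3 * 4 * 4 * 4, 2, 16, 16) = (2, 192, 2, 16, 16); unpatchify reverses this.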
905
+
906
+ def unpatchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
907
+ if patch_size_hw == 1 and patch_size_t == 1:
908
+ return x
909
+
910
+ if (
911
+ (x.dim() == 5)
912
+ and (patch_size_hw > patch_size_t)
913
+ and (patch_size_t > 1 or add_channel_padding)
914
+ ):
915
+ channels_to_keep = int(x.shape[1] * (patch_size_t / patch_size_hw))
916
+ x = x[:, :channels_to_keep, :, :, :]
917
+
918
+ if x.dim() == 4:
919
+ x = rearrange(
920
+ x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw
921
+ )
922
+ elif x.dim() == 5:
923
+ x = rearrange(
924
+ x,
925
+ "b (c p r q) f h w -> b c (f p) (h q) (w r)",
926
+ p=patch_size_t,
927
+ q=patch_size_hw,
928
+ r=patch_size_hw,
929
+ )
930
+
931
+ return x
932
+
933
+
934
+ def create_video_autoencoder_config(
935
+ latent_channels: int = 4,
936
+ ):
937
+ config = {
938
+ "_class_name": "VideoAutoencoder",
939
+ "dims": (
940
+ 2,
941
+ 1,
942
+ ), # 2 for Conv2d, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
943
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
944
+ "out_channels": 3, # Number of output color channels
945
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
946
+ "block_out_channels": [
947
+ 128,
948
+ 256,
949
+ 512,
950
+ 512,
951
+ ], # Number of output channels of each encoder / decoder inner block
952
+ "patch_size": 1,
953
+ }
954
+
955
+ return config
956
+
957
+
958
+ def create_video_autoencoder_pathify4x4x4_config(
959
+ latent_channels: int = 4,
960
+ ):
961
+ config = {
962
+ "_class_name": "VideoAutoencoder",
963
+ "dims": (
964
+ 2,
965
+ 1,
966
+ ), # 2 for Conv2d, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
967
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
968
+ "out_channels": 3, # Number of output color channels
969
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
970
+ "block_out_channels": [512]
971
+ * 4, # Number of output channels of each encoder / decoder inner block
972
+ "patch_size": 4,
973
+ "latent_log_var": "uniform",
974
+ }
975
+
976
+ return config
977
+
978
+
979
+ def create_video_autoencoder_pathify4x4_config(
980
+ latent_channels: int = 4,
981
+ ):
982
+ config = {
983
+ "_class_name": "VideoAutoencoder",
984
+ "dims": 2, # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
985
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
986
+ "out_channels": 3, # Number of output color channels
987
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
988
+ "block_out_channels": [512]
989
+ * 4, # Number of output channels of each encoder / decoder inner block
990
+ "patch_size": 4,
991
+ "norm_layer": "pixel_norm",
992
+ }
993
+
994
+ return config
995
+
996
+
997
+ def test_vae_patchify_unpatchify():
998
+ import torch
999
+
1000
+ x = torch.randn(2, 3, 8, 64, 64)
1001
+ x_patched = patchify(x, patch_size_hw=4, patch_size_t=4)
1002
+ x_unpatched = unpatchify(x_patched, patch_size_hw=4, patch_size_t=4)
1003
+ assert torch.allclose(x, x_unpatched)
1004
+
1005
+
1006
+ def demo_video_autoencoder_forward_backward():
1007
+ # Configuration for the VideoAutoencoder
1008
+ config = create_video_autoencoder_pathify4x4x4_config()
1009
+
1010
+ # Instantiate the VideoAutoencoder with the specified configuration
1011
+ video_autoencoder = VideoAutoencoder.from_config(config)
1012
+
1013
+ print(video_autoencoder)
1014
+
1015
+ # Print the total number of parameters in the video autoencoder
1016
+ total_params = sum(p.numel() for p in video_autoencoder.parameters())
1017
+ print(f"Total number of parameters in VideoAutoencoder: {total_params:,}")
1018
+
1019
+ # Create a mock input tensor simulating a batch of videos
1020
+ # Shape: (batch_size, channels, depth, height, width)
1021
+ # E.g., 2 videos, each with 3 color channels, 8 frames, and 64x64 pixels per frame
1022
+ input_videos = torch.randn(2, 3, 8, 64, 64)
1023
+
1024
+ # Forward pass: encode and decode the input videos
1025
+ latent = video_autoencoder.encode(input_videos).latent_dist.mode()
1026
+ print(f"input shape={input_videos.shape}")
1027
+ print(f"latent shape={latent.shape}")
1028
+ reconstructed_videos = video_autoencoder.decode(
1029
+ latent, target_shape=input_videos.shape
1030
+ ).sample
1031
+
1032
+ print(f"reconstructed shape={reconstructed_videos.shape}")
1033
+
1034
+ # Calculate the loss (e.g., mean squared error)
1035
+ loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)
1036
+
1037
+ # Perform backward pass
1038
+ loss.backward()
1039
+
1040
+ print(f"Demo completed with loss: {loss.item()}")
1041
+
1042
+
1043
+ # Ensure to call the demo function to execute the forward and backward pass
1044
+ if __name__ == "__main__":
1045
+ demo_video_autoencoder_forward_backward()
flash_head/ltx_video/models/transformers/__init__.py ADDED
File without changes
flash_head/ltx_video/models/transformers/attention.py ADDED
@@ -0,0 +1,1265 @@
1
+ import inspect
2
+ from importlib import import_module
3
+ from typing import Any, Dict, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
8
+ from diffusers.models.attention import _chunked_feed_forward
9
+ from diffusers.models.attention_processor import (
10
+ LoRAAttnAddedKVProcessor,
11
+ LoRAAttnProcessor,
12
+ LoRAAttnProcessor2_0,
13
+ LoRAXFormersAttnProcessor,
14
+ SpatialNorm,
15
+ )
16
+ from diffusers.models.lora import LoRACompatibleLinear
17
+ from diffusers.models.normalization import RMSNorm
18
+ from diffusers.utils import deprecate, logging
19
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
20
+ from einops import rearrange
21
+ from torch import nn
22
+
23
+ from flash_head.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
24
+
25
+ try:
26
+ from torch_xla.experimental.custom_kernel import flash_attention
27
+ except ImportError:
28
+ # workaround for automatic tests. Currently this function is manually patched
29
+ # to the torch_xla lib on setup of container
30
+ pass
31
+
32
+ # code adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ @maybe_allow_in_graph
38
+ class BasicTransformerBlock(nn.Module):
39
+ r"""
40
+ A basic Transformer block.
41
+
42
+ Parameters:
43
+ dim (`int`): The number of channels in the input and output.
44
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
45
+ attention_head_dim (`int`): The number of channels in each head.
46
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
47
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
48
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
49
+ num_embeds_ada_norm (:
50
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
51
+ attention_bias (:
52
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
53
+ only_cross_attention (`bool`, *optional*):
54
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
55
+ double_self_attention (`bool`, *optional*):
56
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
57
+ upcast_attention (`bool`, *optional*):
58
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
59
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
60
+ Whether to use learnable elementwise affine parameters for normalization.
61
+ qk_norm (`str`, *optional*, defaults to None):
62
+ Set to 'layer_norm' or `rms_norm` to perform query and key normalization.
63
+ adaptive_norm (`str`, *optional*, defaults to `"single_scale_shift"`):
64
+ The type of adaptive norm to use. Can be `"single_scale_shift"`, `"single_scale"` or "none".
65
+ standardization_norm (`str`, *optional*, defaults to `"layer_norm"`):
66
+ The type of pre-normalization to use. Can be `"layer_norm"` or `"rms_norm"`.
67
+ final_dropout (`bool` *optional*, defaults to False):
68
+ Whether to apply a final dropout after the last feed-forward layer.
69
+ attention_type (`str`, *optional*, defaults to `"default"`):
70
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
71
+ positional_embeddings (`str`, *optional*, defaults to `None`):
72
+ The type of positional embeddings to apply to.
73
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
74
+ The maximum number of positional embeddings to apply.
75
+ """
76
+
77
+ def __init__(
78
+ self,
79
+ dim: int,
80
+ num_attention_heads: int,
81
+ attention_head_dim: int,
82
+ dropout=0.0,
83
+ cross_attention_dim: Optional[int] = None,
84
+ activation_fn: str = "geglu",
85
+ num_embeds_ada_norm: Optional[int] = None, # pylint: disable=unused-argument
86
+ attention_bias: bool = False,
87
+ only_cross_attention: bool = False,
88
+ double_self_attention: bool = False,
89
+ upcast_attention: bool = False,
90
+ norm_elementwise_affine: bool = True,
91
+ adaptive_norm: str = "single_scale_shift", # 'single_scale_shift', 'single_scale' or 'none'
92
+ standardization_norm: str = "layer_norm", # 'layer_norm' or 'rms_norm'
93
+ norm_eps: float = 1e-5,
94
+ qk_norm: Optional[str] = None,
95
+ final_dropout: bool = False,
96
+ attention_type: str = "default", # pylint: disable=unused-argument
97
+ ff_inner_dim: Optional[int] = None,
98
+ ff_bias: bool = True,
99
+ attention_out_bias: bool = True,
100
+ use_tpu_flash_attention: bool = False,
101
+ use_rope: bool = False,
102
+ ):
103
+ super().__init__()
104
+ self.only_cross_attention = only_cross_attention
105
+ self.use_tpu_flash_attention = use_tpu_flash_attention
106
+ self.adaptive_norm = adaptive_norm
107
+
108
+ assert standardization_norm in ["layer_norm", "rms_norm"]
109
+ assert adaptive_norm in ["single_scale_shift", "single_scale", "none"]
110
+
111
+ make_norm_layer = (
112
+ nn.LayerNorm if standardization_norm == "layer_norm" else RMSNorm
113
+ )
114
+
115
+ # Define 3 blocks. Each block has its own normalization layer.
116
+ # 1. Self-Attn
117
+ self.norm1 = make_norm_layer(
118
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
119
+ )
120
+
121
+ self.attn1 = Attention(
122
+ query_dim=dim,
123
+ heads=num_attention_heads,
124
+ dim_head=attention_head_dim,
125
+ dropout=dropout,
126
+ bias=attention_bias,
127
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
128
+ upcast_attention=upcast_attention,
129
+ out_bias=attention_out_bias,
130
+ use_tpu_flash_attention=use_tpu_flash_attention,
131
+ qk_norm=qk_norm,
132
+ use_rope=use_rope,
133
+ )
134
+
135
+ # 2. Cross-Attn
136
+ if cross_attention_dim is not None or double_self_attention:
137
+ self.attn2 = Attention(
138
+ query_dim=dim,
139
+ cross_attention_dim=(
140
+ cross_attention_dim if not double_self_attention else None
141
+ ),
142
+ heads=num_attention_heads,
143
+ dim_head=attention_head_dim,
144
+ dropout=dropout,
145
+ bias=attention_bias,
146
+ upcast_attention=upcast_attention,
147
+ out_bias=attention_out_bias,
148
+ use_tpu_flash_attention=use_tpu_flash_attention,
149
+ qk_norm=qk_norm,
150
+ use_rope=use_rope,
151
+ ) # is self-attn if encoder_hidden_states is none
152
+
153
+ if adaptive_norm == "none":
154
+ self.attn2_norm = make_norm_layer(
155
+ dim, norm_eps, norm_elementwise_affine
156
+ )
157
+ else:
158
+ self.attn2 = None
159
+ self.attn2_norm = None
160
+
161
+ self.norm2 = make_norm_layer(dim, norm_eps, norm_elementwise_affine)
162
+
163
+ # 3. Feed-forward
164
+ self.ff = FeedForward(
165
+ dim,
166
+ dropout=dropout,
167
+ activation_fn=activation_fn,
168
+ final_dropout=final_dropout,
169
+ inner_dim=ff_inner_dim,
170
+ bias=ff_bias,
171
+ )
172
+
173
+ # 5. Scale-shift for PixArt-Alpha.
174
+ if adaptive_norm != "none":
175
+ num_ada_params = 4 if adaptive_norm == "single_scale" else 6
176
+ self.scale_shift_table = nn.Parameter(
177
+ torch.randn(num_ada_params, dim) / dim**0.5
178
+ )
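+ # The six ada parameters are shift/scale/gate for self-attention followed by
+ # shift/scale/gate for the feed-forward path; with "single_scale" only the four
+ # scale/gate entries exist. They are read back via ada_values.unbind(dim=2) in forward().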
179
+
180
+ # let chunk size default to None
181
+ self._chunk_size = None
182
+ self._chunk_dim = 0
183
+
184
+ def set_use_tpu_flash_attention(self):
185
+ r"""
186
+ Function sets the flag in this object and propagates down the children. The flag will enforce the usage of TPU
187
+ attention kernel.
188
+ """
189
+ self.use_tpu_flash_attention = True
190
+ self.attn1.set_use_tpu_flash_attention()
191
+ if self.attn2 is not None:
+ self.attn2.set_use_tpu_flash_attention()
192
+
193
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
194
+ # Sets chunk feed-forward
195
+ self._chunk_size = chunk_size
196
+ self._chunk_dim = dim
197
+
198
+ def forward(
199
+ self,
200
+ hidden_states: torch.FloatTensor,
201
+ freqs_cis: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
202
+ attention_mask: Optional[torch.FloatTensor] = None,
203
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
204
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
205
+ timestep: Optional[torch.LongTensor] = None,
206
+ cross_attention_kwargs: Dict[str, Any] = None,
207
+ class_labels: Optional[torch.LongTensor] = None,
208
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
209
+ skip_layer_mask: Optional[torch.Tensor] = None,
210
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
211
+ ) -> torch.FloatTensor:
212
+ if cross_attention_kwargs is not None:
213
+ if cross_attention_kwargs.get("scale", None) is not None:
214
+ logger.warning(
215
+ "Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored."
216
+ )
217
+
218
+ # Notice that normalization is always applied before the real computation in the following blocks.
219
+ # 0. Self-Attention
220
+ batch_size = hidden_states.shape[0]
221
+
222
+ original_hidden_states = hidden_states
223
+
224
+ norm_hidden_states = self.norm1(hidden_states)
225
+
226
+ # Apply ada_norm_single
227
+ if self.adaptive_norm in ["single_scale_shift", "single_scale"]:
228
+ assert timestep.ndim == 3 # [batch, 1 or num_tokens, embedding_dim]
229
+ num_ada_params = self.scale_shift_table.shape[0]
230
+ ada_values = self.scale_shift_table[None, None] + timestep.reshape(
231
+ batch_size, timestep.shape[1], num_ada_params, -1
232
+ )
233
+ if self.adaptive_norm == "single_scale_shift":
234
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
235
+ ada_values.unbind(dim=2)
236
+ )
237
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
238
+ else:
239
+ scale_msa, gate_msa, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
240
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa)
241
+ elif self.adaptive_norm == "none":
242
+ scale_msa, gate_msa, scale_mlp, gate_mlp = None, None, None, None
243
+ else:
244
+ raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
245
+
246
+ norm_hidden_states = norm_hidden_states.squeeze(
247
+ 1
248
+ ) # TODO: Check if this is needed
249
+
250
+ # 1. Prepare GLIGEN inputs
251
+ cross_attention_kwargs = (
252
+ cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
253
+ )
254
+
255
+ attn_output = self.attn1(
256
+ norm_hidden_states,
257
+ freqs_cis=freqs_cis,
258
+ encoder_hidden_states=(
259
+ encoder_hidden_states if self.only_cross_attention else None
260
+ ),
261
+ attention_mask=attention_mask,
262
+ skip_layer_mask=skip_layer_mask,
263
+ skip_layer_strategy=skip_layer_strategy,
264
+ **cross_attention_kwargs,
265
+ )
266
+ if gate_msa is not None:
267
+ attn_output = gate_msa * attn_output
268
+
269
+ hidden_states = attn_output + hidden_states
270
+ if hidden_states.ndim == 4:
271
+ hidden_states = hidden_states.squeeze(1)
272
+
273
+ # 3. Cross-Attention
274
+ if self.attn2 is not None:
275
+ if self.adaptive_norm == "none":
276
+ attn_input = self.attn2_norm(hidden_states)
277
+ else:
278
+ attn_input = hidden_states
279
+ attn_output = self.attn2(
280
+ attn_input,
281
+ freqs_cis=freqs_cis,
282
+ encoder_hidden_states=encoder_hidden_states,
283
+ attention_mask=encoder_attention_mask,
284
+ **cross_attention_kwargs,
285
+ )
286
+ hidden_states = attn_output + hidden_states
287
+
288
+ # 4. Feed-forward
289
+ norm_hidden_states = self.norm2(hidden_states)
290
+ if self.adaptive_norm == "single_scale_shift":
291
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
292
+ elif self.adaptive_norm == "single_scale":
293
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp)
294
+ elif self.adaptive_norm == "none":
295
+ pass
296
+ else:
297
+ raise ValueError(f"Unknown adaptive norm type: {self.adaptive_norm}")
298
+
299
+ if self._chunk_size is not None:
300
+ # "feed_forward_chunk_size" can be used to save memory
301
+ ff_output = _chunked_feed_forward(
302
+ self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size
303
+ )
304
+ else:
305
+ ff_output = self.ff(norm_hidden_states)
306
+ if gate_mlp is not None:
307
+ ff_output = gate_mlp * ff_output
308
+
309
+ hidden_states = ff_output + hidden_states
310
+ if hidden_states.ndim == 4:
311
+ hidden_states = hidden_states.squeeze(1)
312
+
313
+ if (
314
+ skip_layer_mask is not None
315
+ and skip_layer_strategy == SkipLayerStrategy.TransformerBlock
316
+ ):
317
+ skip_layer_mask = skip_layer_mask.view(-1, 1, 1)
318
+ hidden_states = hidden_states * skip_layer_mask + original_hidden_states * (
319
+ 1.0 - skip_layer_mask
320
+ )
321
+
322
+ return hidden_states
323
+
324
+
325
+ @maybe_allow_in_graph
326
+ class Attention(nn.Module):
327
+ r"""
328
+ A cross attention layer.
329
+
330
+ Parameters:
331
+ query_dim (`int`):
332
+ The number of channels in the query.
333
+ cross_attention_dim (`int`, *optional*):
334
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
335
+ heads (`int`, *optional*, defaults to 8):
336
+ The number of heads to use for multi-head attention.
337
+ dim_head (`int`, *optional*, defaults to 64):
338
+ The number of channels in each head.
339
+ dropout (`float`, *optional*, defaults to 0.0):
340
+ The dropout probability to use.
341
+ bias (`bool`, *optional*, defaults to False):
342
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
343
+ upcast_attention (`bool`, *optional*, defaults to False):
344
+ Set to `True` to upcast the attention computation to `float32`.
345
+ upcast_softmax (`bool`, *optional*, defaults to False):
346
+ Set to `True` to upcast the softmax computation to `float32`.
347
+ cross_attention_norm (`str`, *optional*, defaults to `None`):
348
+ The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
349
+ cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
350
+ The number of groups to use for the group norm in the cross attention.
351
+ added_kv_proj_dim (`int`, *optional*, defaults to `None`):
352
+ The number of channels to use for the added key and value projections. If `None`, no projection is used.
353
+ norm_num_groups (`int`, *optional*, defaults to `None`):
354
+ The number of groups to use for the group norm in the attention.
355
+ spatial_norm_dim (`int`, *optional*, defaults to `None`):
356
+ The number of channels to use for the spatial normalization.
357
+ out_bias (`bool`, *optional*, defaults to `True`):
358
+ Set to `True` to use a bias in the output linear layer.
359
+ scale_qk (`bool`, *optional*, defaults to `True`):
360
+ Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
361
+ qk_norm (`str`, *optional*, defaults to None):
362
+ Set to 'layer_norm' or `rms_norm` to perform query and key normalization.
363
+ only_cross_attention (`bool`, *optional*, defaults to `False`):
364
+ Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
365
+ `added_kv_proj_dim` is not `None`.
366
+ eps (`float`, *optional*, defaults to 1e-5):
367
+ An additional value added to the denominator in group normalization that is used for numerical stability.
368
+ rescale_output_factor (`float`, *optional*, defaults to 1.0):
369
+ A factor to rescale the output by dividing it with this value.
370
+ residual_connection (`bool`, *optional*, defaults to `False`):
371
+ Set to `True` to add the residual connection to the output.
372
+ _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
373
+ Set to `True` if the attention block is loaded from a deprecated state dict.
374
+ processor (`AttnProcessor`, *optional*, defaults to `None`):
375
+ The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
376
+ `AttnProcessor` otherwise.
377
+ """
378
+
379
+ def __init__(
380
+ self,
381
+ query_dim: int,
382
+ cross_attention_dim: Optional[int] = None,
383
+ heads: int = 8,
384
+ dim_head: int = 64,
385
+ dropout: float = 0.0,
386
+ bias: bool = False,
387
+ upcast_attention: bool = False,
388
+ upcast_softmax: bool = False,
389
+ cross_attention_norm: Optional[str] = None,
390
+ cross_attention_norm_num_groups: int = 32,
391
+ added_kv_proj_dim: Optional[int] = None,
392
+ norm_num_groups: Optional[int] = None,
393
+ spatial_norm_dim: Optional[int] = None,
394
+ out_bias: bool = True,
395
+ scale_qk: bool = True,
396
+ qk_norm: Optional[str] = None,
397
+ only_cross_attention: bool = False,
398
+ eps: float = 1e-5,
399
+ rescale_output_factor: float = 1.0,
400
+ residual_connection: bool = False,
401
+ _from_deprecated_attn_block: bool = False,
402
+ processor: Optional["AttnProcessor"] = None,
403
+ out_dim: int = None,
404
+ use_tpu_flash_attention: bool = False,
405
+ use_rope: bool = False,
406
+ ):
407
+ super().__init__()
408
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
409
+ self.query_dim = query_dim
410
+ self.use_bias = bias
411
+ self.is_cross_attention = cross_attention_dim is not None
412
+ self.cross_attention_dim = (
413
+ cross_attention_dim if cross_attention_dim is not None else query_dim
414
+ )
415
+ self.upcast_attention = upcast_attention
416
+ self.upcast_softmax = upcast_softmax
417
+ self.rescale_output_factor = rescale_output_factor
418
+ self.residual_connection = residual_connection
419
+ self.dropout = dropout
420
+ self.fused_projections = False
421
+ self.out_dim = out_dim if out_dim is not None else query_dim
422
+ self.use_tpu_flash_attention = use_tpu_flash_attention
423
+ self.use_rope = use_rope
424
+
425
+ # we make use of this private variable to know whether this class is loaded
426
+ # with a deprecated state dict so that we can convert it on the fly
427
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
428
+
429
+ self.scale_qk = scale_qk
430
+ self.scale = dim_head**-0.5 if self.scale_qk else 1.0
431
+
432
+ if qk_norm is None:
433
+ self.q_norm = nn.Identity()
434
+ self.k_norm = nn.Identity()
435
+ elif qk_norm == "rms_norm":
436
+ self.q_norm = RMSNorm(dim_head * heads, eps=1e-5)
437
+ self.k_norm = RMSNorm(dim_head * heads, eps=1e-5)
438
+ elif qk_norm == "layer_norm":
439
+ self.q_norm = nn.LayerNorm(dim_head * heads, eps=1e-5)
440
+ self.k_norm = nn.LayerNorm(dim_head * heads, eps=1e-5)
441
+ else:
442
+ raise ValueError(f"Unsupported qk_norm method: {qk_norm}")
443
+
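+ # Note: q_norm / k_norm are built over the full projection width (dim_head * heads)
+ # rather than per attention head.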
444
+ self.heads = out_dim // dim_head if out_dim is not None else heads
445
+ # for slice_size > 0 the attention score computation
446
+ # is split across the batch axis to save memory
447
+ # You can set slice_size with `set_attention_slice`
448
+ self.sliceable_head_dim = heads
449
+
450
+ self.added_kv_proj_dim = added_kv_proj_dim
451
+ self.only_cross_attention = only_cross_attention
452
+
453
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
454
+ raise ValueError(
455
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
456
+ )
457
+
458
+ if norm_num_groups is not None:
459
+ self.group_norm = nn.GroupNorm(
460
+ num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True
461
+ )
462
+ else:
463
+ self.group_norm = None
464
+
465
+ if spatial_norm_dim is not None:
466
+ self.spatial_norm = SpatialNorm(
467
+ f_channels=query_dim, zq_channels=spatial_norm_dim
468
+ )
469
+ else:
470
+ self.spatial_norm = None
471
+
472
+ if cross_attention_norm is None:
473
+ self.norm_cross = None
474
+ elif cross_attention_norm == "layer_norm":
475
+ self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
476
+ elif cross_attention_norm == "group_norm":
477
+ if self.added_kv_proj_dim is not None:
478
+ # The given `encoder_hidden_states` are initially of shape
479
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
480
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
481
+ # before the projection, so we need to use `added_kv_proj_dim` as
482
+ # the number of channels for the group norm.
483
+ norm_cross_num_channels = added_kv_proj_dim
484
+ else:
485
+ norm_cross_num_channels = self.cross_attention_dim
486
+
487
+ self.norm_cross = nn.GroupNorm(
488
+ num_channels=norm_cross_num_channels,
489
+ num_groups=cross_attention_norm_num_groups,
490
+ eps=1e-5,
491
+ affine=True,
492
+ )
493
+ else:
494
+ raise ValueError(
495
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
496
+ )
497
+
498
+ linear_cls = nn.Linear
499
+
500
+ self.linear_cls = linear_cls
501
+ self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias)
502
+
503
+ if not self.only_cross_attention:
504
+ # only relevant for the `AddedKVProcessor` classes
505
+ self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
506
+ self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias)
507
+ else:
508
+ self.to_k = None
509
+ self.to_v = None
510
+
511
+ if self.added_kv_proj_dim is not None:
512
+ self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
513
+ self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)
514
+
515
+ self.to_out = nn.ModuleList([])
516
+ self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias))
517
+ self.to_out.append(nn.Dropout(dropout))
518
+
519
+ # set attention processor
520
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
521
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
522
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
523
+ if processor is None:
524
+ processor = AttnProcessor2_0()
525
+ self.set_processor(processor)
526
+
527
+ def set_use_tpu_flash_attention(self):
528
+ r"""
529
+        Sets the flag on this object. The flag enforces the use of the TPU flash-attention kernel.
530
+ """
531
+ self.use_tpu_flash_attention = True
532
+
533
+ def set_processor(self, processor: "AttnProcessor") -> None:
534
+ r"""
535
+ Set the attention processor to use.
536
+
537
+ Args:
538
+ processor (`AttnProcessor`):
539
+ The attention processor to use.
540
+ """
541
+ # if current processor is in `self._modules` and if passed `processor` is not, we need to
542
+ # pop `processor` from `self._modules`
543
+ if (
544
+ hasattr(self, "processor")
545
+ and isinstance(self.processor, torch.nn.Module)
546
+ and not isinstance(processor, torch.nn.Module)
547
+ ):
548
+ logger.info(
549
+ f"You are removing possibly trained weights of {self.processor} with {processor}"
550
+ )
551
+ self._modules.pop("processor")
552
+
553
+ self.processor = processor
554
+
555
+ def get_processor(
556
+ self, return_deprecated_lora: bool = False
557
+ ) -> "AttentionProcessor": # noqa: F821
558
+ r"""
559
+ Get the attention processor in use.
560
+
561
+ Args:
562
+ return_deprecated_lora (`bool`, *optional*, defaults to `False`):
563
+ Set to `True` to return the deprecated LoRA attention processor.
564
+
565
+ Returns:
566
+ "AttentionProcessor": The attention processor in use.
567
+ """
568
+ if not return_deprecated_lora:
569
+ return self.processor
570
+
571
+ # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
572
+ # serialization format for LoRA Attention Processors. It should be deleted once the integration
573
+ # with PEFT is completed.
574
+ is_lora_activated = {
575
+ name: module.lora_layer is not None
576
+ for name, module in self.named_modules()
577
+ if hasattr(module, "lora_layer")
578
+ }
579
+
580
+ # 1. if no layer has a LoRA activated we can return the processor as usual
581
+ if not any(is_lora_activated.values()):
582
+ return self.processor
583
+
584
+        # LoRA is not applied to `add_k_proj` or `add_v_proj`, so exclude them from the check
585
+ is_lora_activated.pop("add_k_proj", None)
586
+ is_lora_activated.pop("add_v_proj", None)
587
+        # 2. else it is not possible that only some layers have LoRA activated
588
+ if not all(is_lora_activated.values()):
589
+ raise ValueError(
590
+ f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
591
+ )
592
+
593
+ # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
594
+ non_lora_processor_cls_name = self.processor.__class__.__name__
595
+ lora_processor_cls = getattr(
596
+ import_module(__name__), "LoRA" + non_lora_processor_cls_name
597
+ )
598
+
599
+ hidden_size = self.inner_dim
600
+
601
+ # now create a LoRA attention processor from the LoRA layers
602
+ if lora_processor_cls in [
603
+ LoRAAttnProcessor,
604
+ LoRAAttnProcessor2_0,
605
+ LoRAXFormersAttnProcessor,
606
+ ]:
607
+ kwargs = {
608
+ "cross_attention_dim": self.cross_attention_dim,
609
+ "rank": self.to_q.lora_layer.rank,
610
+ "network_alpha": self.to_q.lora_layer.network_alpha,
611
+ "q_rank": self.to_q.lora_layer.rank,
612
+ "q_hidden_size": self.to_q.lora_layer.out_features,
613
+ "k_rank": self.to_k.lora_layer.rank,
614
+ "k_hidden_size": self.to_k.lora_layer.out_features,
615
+ "v_rank": self.to_v.lora_layer.rank,
616
+ "v_hidden_size": self.to_v.lora_layer.out_features,
617
+ "out_rank": self.to_out[0].lora_layer.rank,
618
+ "out_hidden_size": self.to_out[0].lora_layer.out_features,
619
+ }
620
+
621
+ if hasattr(self.processor, "attention_op"):
622
+ kwargs["attention_op"] = self.processor.attention_op
623
+
624
+ lora_processor = lora_processor_cls(hidden_size, **kwargs)
625
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
626
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
627
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
628
+ lora_processor.to_out_lora.load_state_dict(
629
+ self.to_out[0].lora_layer.state_dict()
630
+ )
631
+ elif lora_processor_cls == LoRAAttnAddedKVProcessor:
632
+ lora_processor = lora_processor_cls(
633
+ hidden_size,
634
+ cross_attention_dim=self.add_k_proj.weight.shape[0],
635
+ rank=self.to_q.lora_layer.rank,
636
+ network_alpha=self.to_q.lora_layer.network_alpha,
637
+ )
638
+ lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict())
639
+ lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict())
640
+ lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict())
641
+ lora_processor.to_out_lora.load_state_dict(
642
+ self.to_out[0].lora_layer.state_dict()
643
+ )
644
+
645
+ # only save if used
646
+ if self.add_k_proj.lora_layer is not None:
647
+ lora_processor.add_k_proj_lora.load_state_dict(
648
+ self.add_k_proj.lora_layer.state_dict()
649
+ )
650
+ lora_processor.add_v_proj_lora.load_state_dict(
651
+ self.add_v_proj.lora_layer.state_dict()
652
+ )
653
+ else:
654
+ lora_processor.add_k_proj_lora = None
655
+ lora_processor.add_v_proj_lora = None
656
+ else:
657
+ raise ValueError(f"{lora_processor_cls} does not exist.")
658
+
659
+ return lora_processor
660
+
661
+ def forward(
662
+ self,
663
+ hidden_states: torch.FloatTensor,
664
+ freqs_cis: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
665
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
666
+ attention_mask: Optional[torch.FloatTensor] = None,
667
+ skip_layer_mask: Optional[torch.Tensor] = None,
668
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
669
+ **cross_attention_kwargs,
670
+ ) -> torch.Tensor:
671
+ r"""
672
+ The forward method of the `Attention` class.
673
+
674
+ Args:
675
+ hidden_states (`torch.Tensor`):
676
+ The hidden states of the query.
677
+ encoder_hidden_states (`torch.Tensor`, *optional*):
678
+ The hidden states of the encoder.
679
+ attention_mask (`torch.Tensor`, *optional*):
680
+ The attention mask to use. If `None`, no mask is applied.
681
+ skip_layer_mask (`torch.Tensor`, *optional*):
682
+ The skip layer mask to use. If `None`, no mask is applied.
683
+ skip_layer_strategy (`SkipLayerStrategy`, *optional*, defaults to `None`):
684
+ Controls which layers to skip for spatiotemporal guidance.
685
+ **cross_attention_kwargs:
686
+ Additional keyword arguments to pass along to the cross attention.
687
+
688
+ Returns:
689
+ `torch.Tensor`: The output of the attention layer.
690
+ """
691
+ # The `Attention` class can call different attention processors / attention functions
692
+ # here we simply pass along all tensors to the selected processor class
693
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
694
+
695
+ attn_parameters = set(
696
+ inspect.signature(self.processor.__call__).parameters.keys()
697
+ )
698
+ unused_kwargs = [
699
+ k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters
700
+ ]
701
+ if len(unused_kwargs) > 0:
702
+ logger.warning(
703
+ f"cross_attention_kwargs {unused_kwargs} are not expected by"
704
+ f" {self.processor.__class__.__name__} and will be ignored."
705
+ )
706
+ cross_attention_kwargs = {
707
+ k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters
708
+ }
709
+
710
+ return self.processor(
711
+ self,
712
+ hidden_states,
713
+ freqs_cis=freqs_cis,
714
+ encoder_hidden_states=encoder_hidden_states,
715
+ attention_mask=attention_mask,
716
+ skip_layer_mask=skip_layer_mask,
717
+ skip_layer_strategy=skip_layer_strategy,
718
+ **cross_attention_kwargs,
719
+ )
720
+
721
+ def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor:
722
+ r"""
723
+ Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads`
724
+ is the number of heads initialized while constructing the `Attention` class.
725
+
726
+ Args:
727
+ tensor (`torch.Tensor`): The tensor to reshape.
728
+
729
+ Returns:
730
+ `torch.Tensor`: The reshaped tensor.
731
+ """
732
+ head_size = self.heads
733
+ batch_size, seq_len, dim = tensor.shape
734
+ tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
735
+ tensor = tensor.permute(0, 2, 1, 3).reshape(
736
+ batch_size // head_size, seq_len, dim * head_size
737
+ )
738
+ return tensor
739
+
740
+ def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor:
741
+ r"""
742
+        Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]`. `heads` is
743
+ the number of heads initialized while constructing the `Attention` class.
744
+
745
+ Args:
746
+ tensor (`torch.Tensor`): The tensor to reshape.
747
+ out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is
748
+ reshaped to `[batch_size * heads, seq_len, dim // heads]`.
749
+
750
+ Returns:
751
+ `torch.Tensor`: The reshaped tensor.
752
+ """
753
+
754
+ head_size = self.heads
755
+ if tensor.ndim == 3:
756
+ batch_size, seq_len, dim = tensor.shape
757
+ extra_dim = 1
758
+ else:
759
+ batch_size, extra_dim, seq_len, dim = tensor.shape
760
+ tensor = tensor.reshape(
761
+ batch_size, seq_len * extra_dim, head_size, dim // head_size
762
+ )
763
+ tensor = tensor.permute(0, 2, 1, 3)
764
+
765
+ if out_dim == 3:
766
+ tensor = tensor.reshape(
767
+ batch_size * head_size, seq_len * extra_dim, dim // head_size
768
+ )
769
+
770
+ return tensor
771
+
772
+ def get_attention_scores(
773
+ self,
774
+ query: torch.Tensor,
775
+ key: torch.Tensor,
776
+ attention_mask: torch.Tensor = None,
777
+ ) -> torch.Tensor:
778
+ r"""
779
+ Compute the attention scores.
780
+
781
+ Args:
782
+ query (`torch.Tensor`): The query tensor.
783
+ key (`torch.Tensor`): The key tensor.
784
+ attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied.
785
+
786
+ Returns:
787
+ `torch.Tensor`: The attention probabilities/scores.
788
+ """
789
+ dtype = query.dtype
790
+ if self.upcast_attention:
791
+ query = query.float()
792
+ key = key.float()
793
+
794
+ if attention_mask is None:
795
+ baddbmm_input = torch.empty(
796
+ query.shape[0],
797
+ query.shape[1],
798
+ key.shape[1],
799
+ dtype=query.dtype,
800
+ device=query.device,
801
+ )
802
+ beta = 0
803
+ else:
804
+ baddbmm_input = attention_mask
805
+ beta = 1
806
+
807
+ attention_scores = torch.baddbmm(
808
+ baddbmm_input,
809
+ query,
810
+ key.transpose(-1, -2),
811
+ beta=beta,
812
+ alpha=self.scale,
813
+ )
814
+ del baddbmm_input
815
+
816
+ if self.upcast_softmax:
817
+ attention_scores = attention_scores.float()
818
+
819
+ attention_probs = attention_scores.softmax(dim=-1)
820
+ del attention_scores
821
+
822
+ attention_probs = attention_probs.to(dtype)
823
+
824
+ return attention_probs
825
+
826
+ def prepare_attention_mask(
827
+ self,
828
+ attention_mask: torch.Tensor,
829
+ target_length: int,
830
+ batch_size: int,
831
+ out_dim: int = 3,
832
+ ) -> torch.Tensor:
833
+ r"""
834
+ Prepare the attention mask for the attention computation.
835
+
836
+ Args:
837
+ attention_mask (`torch.Tensor`):
838
+ The attention mask to prepare.
839
+ target_length (`int`):
840
+ The target length of the attention mask. This is the length of the attention mask after padding.
841
+ batch_size (`int`):
842
+ The batch size, which is used to repeat the attention mask.
843
+ out_dim (`int`, *optional*, defaults to `3`):
844
+ The output dimension of the attention mask. Can be either `3` or `4`.
845
+
846
+ Returns:
847
+ `torch.Tensor`: The prepared attention mask.
848
+ """
849
+ head_size = self.heads
850
+ if attention_mask is None:
851
+ return attention_mask
852
+
853
+ current_length: int = attention_mask.shape[-1]
854
+ if current_length != target_length:
855
+ if attention_mask.device.type == "mps":
856
+ # HACK: MPS: Does not support padding by greater than dimension of input tensor.
857
+ # Instead, we can manually construct the padding tensor.
858
+ padding_shape = (
859
+ attention_mask.shape[0],
860
+ attention_mask.shape[1],
861
+ target_length,
862
+ )
863
+ padding = torch.zeros(
864
+ padding_shape,
865
+ dtype=attention_mask.dtype,
866
+ device=attention_mask.device,
867
+ )
868
+ attention_mask = torch.cat([attention_mask, padding], dim=2)
869
+ else:
870
+ # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
871
+ # we want to instead pad by (0, remaining_length), where remaining_length is:
872
+ # remaining_length: int = target_length - current_length
873
+ # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
874
+ attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
875
+
876
+ if out_dim == 3:
877
+ if attention_mask.shape[0] < batch_size * head_size:
878
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
879
+ elif out_dim == 4:
880
+ attention_mask = attention_mask.unsqueeze(1)
881
+ attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
882
+
883
+ return attention_mask
884
+
885
+ def norm_encoder_hidden_states(
886
+ self, encoder_hidden_states: torch.Tensor
887
+ ) -> torch.Tensor:
888
+ r"""
889
+ Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the
890
+ `Attention` class.
891
+
892
+ Args:
893
+ encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder.
894
+
895
+ Returns:
896
+ `torch.Tensor`: The normalized encoder hidden states.
897
+ """
898
+ assert (
899
+ self.norm_cross is not None
900
+ ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
901
+
902
+ if isinstance(self.norm_cross, nn.LayerNorm):
903
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
904
+ elif isinstance(self.norm_cross, nn.GroupNorm):
905
+ # Group norm norms along the channels dimension and expects
906
+ # input to be in the shape of (N, C, *). In this case, we want
907
+ # to norm along the hidden dimension, so we need to move
908
+ # (batch_size, sequence_length, hidden_size) ->
909
+ # (batch_size, hidden_size, sequence_length)
910
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
911
+ encoder_hidden_states = self.norm_cross(encoder_hidden_states)
912
+ encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
913
+ else:
914
+ assert False
915
+
916
+ return encoder_hidden_states
917
+
918
+ @staticmethod
919
+ def apply_rotary_emb(
920
+ input_tensor: torch.Tensor,
921
+ freqs_cis: Tuple[torch.FloatTensor, torch.FloatTensor],
922
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
923
+ cos_freqs = freqs_cis[0]
924
+ sin_freqs = freqs_cis[1]
925
+
926
+ t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2)
927
+ t1, t2 = t_dup.unbind(dim=-1)
928
+ t_dup = torch.stack((-t2, t1), dim=-1)
929
+ input_tensor_rot = rearrange(t_dup, "... d r -> ... (d r)")
930
+
931
+ out = input_tensor * cos_freqs + input_tensor_rot * sin_freqs
932
+
933
+ return out
934
+
935
+
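For a quick sanity check on `apply_rotary_emb` above, here is a minimal sketch (assuming only `torch`, `einops`, and the `Attention` class from this file; shapes are illustrative). With cos = 1 and sin = 0 the pairwise rotation is the identity:

    x = torch.randn(2, 128, 2048)    # (batch, tokens, inner_dim); RoPE is applied before the head split
    cos = torch.ones(2, 128, 2048)   # placeholder frequencies; real ones come from the transformer's precompute_freqs_cis
    sin = torch.zeros(2, 128, 2048)
    out = Attention.apply_rotary_emb(x, (cos, sin))
    assert torch.equal(out, x)       # x * cos + rotated(x) * sin reduces to x when cos = 1, sin = 0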
936
+ class AttnProcessor2_0:
937
+ r"""
938
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
939
+ """
940
+
941
+ def __init__(self):
942
+ pass
943
+
944
+ def __call__(
945
+ self,
946
+ attn: Attention,
947
+ hidden_states: torch.FloatTensor,
948
+ freqs_cis: Tuple[torch.FloatTensor, torch.FloatTensor],
949
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
950
+ attention_mask: Optional[torch.FloatTensor] = None,
951
+ temb: Optional[torch.FloatTensor] = None,
952
+ skip_layer_mask: Optional[torch.FloatTensor] = None,
953
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
954
+ *args,
955
+ **kwargs,
956
+ ) -> torch.FloatTensor:
957
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
958
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
959
+ deprecate("scale", "1.0.0", deprecation_message)
960
+
961
+ residual = hidden_states
962
+ if attn.spatial_norm is not None:
963
+ hidden_states = attn.spatial_norm(hidden_states, temb)
964
+
965
+ input_ndim = hidden_states.ndim
966
+
967
+ if input_ndim == 4:
968
+ batch_size, channel, height, width = hidden_states.shape
969
+ hidden_states = hidden_states.view(
970
+ batch_size, channel, height * width
971
+ ).transpose(1, 2)
972
+
973
+ batch_size, sequence_length, _ = (
974
+ hidden_states.shape
975
+ if encoder_hidden_states is None
976
+ else encoder_hidden_states.shape
977
+ )
978
+
979
+ if skip_layer_mask is not None:
980
+ skip_layer_mask = skip_layer_mask.reshape(batch_size, 1, 1)
981
+
982
+ if (attention_mask is not None) and (not attn.use_tpu_flash_attention):
983
+ attention_mask = attn.prepare_attention_mask(
984
+ attention_mask, sequence_length, batch_size
985
+ )
986
+ # scaled_dot_product_attention expects attention_mask shape to be
987
+ # (batch, heads, source_length, target_length)
988
+ attention_mask = attention_mask.view(
989
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
990
+ )
991
+
992
+ if attn.group_norm is not None:
993
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
994
+ 1, 2
995
+ )
996
+
997
+ query = attn.to_q(hidden_states)
998
+ query = attn.q_norm(query)
999
+
1000
+ if encoder_hidden_states is not None:
1001
+ if attn.norm_cross:
1002
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
1003
+ encoder_hidden_states
1004
+ )
1005
+ key = attn.to_k(encoder_hidden_states)
1006
+ key = attn.k_norm(key)
1007
+ else: # if no context provided do self-attention
1008
+ encoder_hidden_states = hidden_states
1009
+ key = attn.to_k(hidden_states)
1010
+ key = attn.k_norm(key)
1011
+ if attn.use_rope:
1012
+ key = attn.apply_rotary_emb(key, freqs_cis)
1013
+ query = attn.apply_rotary_emb(query, freqs_cis)
1014
+
1015
+ value = attn.to_v(encoder_hidden_states)
1016
+ value_for_stg = value
1017
+
1018
+ inner_dim = key.shape[-1]
1019
+ head_dim = inner_dim // attn.heads
1020
+
1021
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1022
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1023
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1024
+
1025
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
1026
+
1027
+ if attn.use_tpu_flash_attention: # use tpu attention offload 'flash attention'
1028
+ q_segment_indexes = None
1029
+ if (
1030
+ attention_mask is not None
1031
+            ):  # if a mask is required, both segment-id fields need to be set
1032
+ # attention_mask = torch.squeeze(attention_mask).to(torch.float32)
1033
+ attention_mask = attention_mask.to(torch.float32)
1034
+ q_segment_indexes = torch.ones(
1035
+ batch_size, query.shape[2], device=query.device, dtype=torch.float32
1036
+ )
1037
+ assert (
1038
+ attention_mask.shape[1] == key.shape[2]
1039
+ ), f"ERROR: KEY SHAPE must be same as attention mask [{key.shape[2]}, {attention_mask.shape[1]}]"
1040
+
1041
+ assert (
1042
+ query.shape[2] % 128 == 0
1043
+ ), f"ERROR: QUERY SHAPE must be divisible by 128 (TPU limitation) [{query.shape[2]}]"
1044
+ assert (
1045
+ key.shape[2] % 128 == 0
1046
+ ), f"ERROR: KEY SHAPE must be divisible by 128 (TPU limitation) [{key.shape[2]}]"
1047
+
1048
+ # run the TPU kernel implemented in jax with pallas
1049
+ hidden_states_a = flash_attention(
1050
+ q=query,
1051
+ k=key,
1052
+ v=value,
1053
+ q_segment_ids=q_segment_indexes,
1054
+ kv_segment_ids=attention_mask,
1055
+ sm_scale=attn.scale,
1056
+ )
1057
+ else:
1058
+ hidden_states_a = F.scaled_dot_product_attention(
1059
+ query,
1060
+ key,
1061
+ value,
1062
+ attn_mask=attention_mask,
1063
+ dropout_p=0.0,
1064
+ is_causal=False,
1065
+ )
1066
+
1067
+ hidden_states_a = hidden_states_a.transpose(1, 2).reshape(
1068
+ batch_size, -1, attn.heads * head_dim
1069
+ )
1070
+ hidden_states_a = hidden_states_a.to(query.dtype)
1071
+
1072
+ if (
1073
+ skip_layer_mask is not None
1074
+ and skip_layer_strategy == SkipLayerStrategy.AttentionSkip
1075
+ ):
1076
+ hidden_states = hidden_states_a * skip_layer_mask + hidden_states * (
1077
+ 1.0 - skip_layer_mask
1078
+ )
1079
+ elif (
1080
+ skip_layer_mask is not None
1081
+ and skip_layer_strategy == SkipLayerStrategy.AttentionValues
1082
+ ):
1083
+ hidden_states = hidden_states_a * skip_layer_mask + value_for_stg * (
1084
+ 1.0 - skip_layer_mask
1085
+ )
1086
+ else:
1087
+ hidden_states = hidden_states_a
1088
+
1089
+ # linear proj
1090
+ hidden_states = attn.to_out[0](hidden_states)
1091
+ # dropout
1092
+ hidden_states = attn.to_out[1](hidden_states)
1093
+
1094
+ if input_ndim == 4:
1095
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
1096
+ batch_size, channel, height, width
1097
+ )
1098
+ if (
1099
+ skip_layer_mask is not None
1100
+ and skip_layer_strategy == SkipLayerStrategy.Residual
1101
+ ):
1102
+ skip_layer_mask = skip_layer_mask.reshape(batch_size, 1, 1, 1)
1103
+
1104
+ if attn.residual_connection:
1105
+ if (
1106
+ skip_layer_mask is not None
1107
+ and skip_layer_strategy == SkipLayerStrategy.Residual
1108
+ ):
1109
+ hidden_states = hidden_states + residual * skip_layer_mask
1110
+ else:
1111
+ hidden_states = hidden_states + residual
1112
+
1113
+ hidden_states = hidden_states / attn.rescale_output_factor
1114
+
1115
+ return hidden_states
1116
+
1117
+
1118
+ class AttnProcessor:
1119
+ r"""
1120
+ Default processor for performing attention-related computations.
1121
+ """
1122
+
1123
+ def __call__(
1124
+ self,
1125
+ attn: Attention,
1126
+ hidden_states: torch.FloatTensor,
1127
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1128
+ attention_mask: Optional[torch.FloatTensor] = None,
1129
+ temb: Optional[torch.FloatTensor] = None,
1130
+ *args,
1131
+ **kwargs,
1132
+ ) -> torch.Tensor:
1133
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
1134
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
1135
+ deprecate("scale", "1.0.0", deprecation_message)
1136
+
1137
+ residual = hidden_states
1138
+
1139
+ if attn.spatial_norm is not None:
1140
+ hidden_states = attn.spatial_norm(hidden_states, temb)
1141
+
1142
+ input_ndim = hidden_states.ndim
1143
+
1144
+ if input_ndim == 4:
1145
+ batch_size, channel, height, width = hidden_states.shape
1146
+ hidden_states = hidden_states.view(
1147
+ batch_size, channel, height * width
1148
+ ).transpose(1, 2)
1149
+
1150
+ batch_size, sequence_length, _ = (
1151
+ hidden_states.shape
1152
+ if encoder_hidden_states is None
1153
+ else encoder_hidden_states.shape
1154
+ )
1155
+ attention_mask = attn.prepare_attention_mask(
1156
+ attention_mask, sequence_length, batch_size
1157
+ )
1158
+
1159
+ if attn.group_norm is not None:
1160
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
1161
+ 1, 2
1162
+ )
1163
+
1164
+ query = attn.to_q(hidden_states)
1165
+
1166
+ if encoder_hidden_states is None:
1167
+ encoder_hidden_states = hidden_states
1168
+ elif attn.norm_cross:
1169
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
1170
+ encoder_hidden_states
1171
+ )
1172
+
1173
+ key = attn.to_k(encoder_hidden_states)
1174
+ value = attn.to_v(encoder_hidden_states)
1175
+
1176
+ query = attn.head_to_batch_dim(query)
1177
+ key = attn.head_to_batch_dim(key)
1178
+ value = attn.head_to_batch_dim(value)
1179
+
1180
+ query = attn.q_norm(query)
1181
+ key = attn.k_norm(key)
1182
+
1183
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
1184
+ hidden_states = torch.bmm(attention_probs, value)
1185
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1186
+
1187
+ # linear proj
1188
+ hidden_states = attn.to_out[0](hidden_states)
1189
+ # dropout
1190
+ hidden_states = attn.to_out[1](hidden_states)
1191
+
1192
+ if input_ndim == 4:
1193
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
1194
+ batch_size, channel, height, width
1195
+ )
1196
+
1197
+ if attn.residual_connection:
1198
+ hidden_states = hidden_states + residual
1199
+
1200
+ hidden_states = hidden_states / attn.rescale_output_factor
1201
+
1202
+ return hidden_states
1203
+
1204
+
1205
+ class FeedForward(nn.Module):
1206
+ r"""
1207
+ A feed-forward layer.
1208
+
1209
+ Parameters:
1210
+ dim (`int`): The number of channels in the input.
1211
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
1212
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
1213
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
1214
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
1215
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
1216
+ bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
1217
+ """
1218
+
1219
+ def __init__(
1220
+ self,
1221
+ dim: int,
1222
+ dim_out: Optional[int] = None,
1223
+ mult: int = 4,
1224
+ dropout: float = 0.0,
1225
+ activation_fn: str = "geglu",
1226
+ final_dropout: bool = False,
1227
+ inner_dim=None,
1228
+ bias: bool = True,
1229
+ ):
1230
+ super().__init__()
1231
+ if inner_dim is None:
1232
+ inner_dim = int(dim * mult)
1233
+ dim_out = dim_out if dim_out is not None else dim
1234
+ linear_cls = nn.Linear
1235
+
1236
+ if activation_fn == "gelu":
1237
+ act_fn = GELU(dim, inner_dim, bias=bias)
1238
+ elif activation_fn == "gelu-approximate":
1239
+ act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
1240
+ elif activation_fn == "geglu":
1241
+ act_fn = GEGLU(dim, inner_dim, bias=bias)
1242
+ elif activation_fn == "geglu-approximate":
1243
+ act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
1244
+ else:
1245
+ raise ValueError(f"Unsupported activation function: {activation_fn}")
1246
+
1247
+ self.net = nn.ModuleList([])
1248
+ # project in
1249
+ self.net.append(act_fn)
1250
+ # project dropout
1251
+ self.net.append(nn.Dropout(dropout))
1252
+ # project out
1253
+ self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
1254
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
1255
+ if final_dropout:
1256
+ self.net.append(nn.Dropout(dropout))
1257
+
1258
+ def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
1259
+ compatible_cls = (GEGLU, LoRACompatibleLinear)
1260
+ for module in self.net:
1261
+ if isinstance(module, compatible_cls):
1262
+ hidden_states = module(hidden_states, scale)
1263
+ else:
1264
+ hidden_states = module(hidden_states)
1265
+ return hidden_states
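A minimal usage sketch for the `FeedForward` block defined above (assuming the imports at the top of this file; the dimensions are illustrative, not taken from any config):

    ff = FeedForward(dim=64, mult=4, activation_fn="gelu-approximate")
    x = torch.randn(1, 16, 64)     # (batch, tokens, dim)
    y = ff(x)                      # projects to dim * mult internally, back to dim on the way out
    assert y.shape == x.shape      # (1, 16, 64)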
flash_head/ltx_video/models/transformers/embeddings.py ADDED
@@ -0,0 +1,129 @@
1
+ # Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from einops import rearrange
7
+ from torch import nn
8
+
9
+
10
+ def get_timestep_embedding(
11
+ timesteps: torch.Tensor,
12
+ embedding_dim: int,
13
+ flip_sin_to_cos: bool = False,
14
+ downscale_freq_shift: float = 1,
15
+ scale: float = 1,
16
+ max_period: int = 10000,
17
+ ):
18
+ """
19
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
20
+
21
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
22
+ These may be fractional.
23
+    :param embedding_dim: the dimension of the output.
24
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
25
+ """
26
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
27
+
28
+ half_dim = embedding_dim // 2
29
+ exponent = -math.log(max_period) * torch.arange(
30
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
31
+ )
32
+ exponent = exponent / (half_dim - downscale_freq_shift)
33
+
34
+ emb = torch.exp(exponent)
35
+ emb = timesteps[:, None].float() * emb[None, :]
36
+
37
+ # scale embeddings
38
+ emb = scale * emb
39
+
40
+ # concat sine and cosine embeddings
41
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
42
+
43
+ # flip sine and cosine embeddings
44
+ if flip_sin_to_cos:
45
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
46
+
47
+ # zero pad
48
+ if embedding_dim % 2 == 1:
49
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
50
+ return emb
51
+
52
+
53
+ def get_3d_sincos_pos_embed(embed_dim, grid, w, h, f):
54
+ """
55
+    Build 3D sin/cos positional embeddings for an (f, h, w) grid of latent positions.
56
+    Returns: pos_embed of shape [f*h*w, embed_dim].
57
+ """
58
+ grid = rearrange(grid, "c (f h w) -> c f h w", h=h, w=w)
59
+ grid = rearrange(grid, "c f h w -> c h w f", h=h, w=w)
60
+ grid = grid.reshape([3, 1, w, h, f])
61
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
62
+ pos_embed = pos_embed.transpose(1, 0, 2, 3)
63
+ return rearrange(pos_embed, "h w f c -> (f h w) c")
64
+
65
+
66
+ def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
67
+ if embed_dim % 3 != 0:
68
+ raise ValueError("embed_dim must be divisible by 3")
69
+
70
+ # use half of dimensions to encode grid_h
71
+ emb_f = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (H*W*T, D/3)
72
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (H*W*T, D/3)
73
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (H*W*T, D/3)
74
+
75
+ emb = np.concatenate([emb_h, emb_w, emb_f], axis=-1) # (H*W*T, D)
76
+ return emb
77
+
78
+
79
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
80
+ """
81
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
82
+ """
83
+ if embed_dim % 2 != 0:
84
+ raise ValueError("embed_dim must be divisible by 2")
85
+
86
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
87
+ omega /= embed_dim / 2.0
88
+ omega = 1.0 / 10000**omega # (D/2,)
89
+
90
+ pos_shape = pos.shape
91
+
92
+ pos = pos.reshape(-1)
93
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
94
+ out = out.reshape([*pos_shape, -1])[0]
95
+
96
+ emb_sin = np.sin(out) # (M, D/2)
97
+ emb_cos = np.cos(out) # (M, D/2)
98
+
99
+ emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (M, D)
100
+ return emb
101
+
102
+
103
+ class SinusoidalPositionalEmbedding(nn.Module):
104
+ """Apply positional information to a sequence of embeddings.
105
+
106
+ Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
107
+ them
108
+
109
+ Args:
110
+ embed_dim: (int): Dimension of the positional embedding.
111
+ max_seq_length: Maximum sequence length to apply positional embeddings
112
+
113
+ """
114
+
115
+ def __init__(self, embed_dim: int, max_seq_length: int = 32):
116
+ super().__init__()
117
+ position = torch.arange(max_seq_length).unsqueeze(1)
118
+ div_term = torch.exp(
119
+ torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)
120
+ )
121
+ pe = torch.zeros(1, max_seq_length, embed_dim)
122
+ pe[0, :, 0::2] = torch.sin(position * div_term)
123
+ pe[0, :, 1::2] = torch.cos(position * div_term)
124
+ self.register_buffer("pe", pe)
125
+
126
+ def forward(self, x):
127
+ _, seq_length, _ = x.shape
128
+ x = x + self.pe[:, :seq_length]
129
+ return x
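A small sketch of calling `get_timestep_embedding` (plain `torch`, illustrative values); with the default `flip_sin_to_cos=False`, the first half of the output holds the sine terms and the second half the cosine terms:

    t = torch.tensor([0, 250, 999])                     # one (possibly fractional) timestep per batch element
    emb = get_timestep_embedding(t, embedding_dim=128)
    assert emb.shape == (3, 128)                        # 64 sine dims followed by 64 cosine dims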
flash_head/ltx_video/models/transformers/symmetric_patchifier.py ADDED
@@ -0,0 +1,84 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ from diffusers.configuration_utils import ConfigMixin
6
+ from einops import rearrange
7
+ from torch import Tensor
8
+
9
+
10
+ class Patchifier(ConfigMixin, ABC):
11
+ def __init__(self, patch_size: int):
12
+ super().__init__()
13
+ self._patch_size = (1, patch_size, patch_size)
14
+
15
+ @abstractmethod
16
+ def patchify(self, latents: Tensor) -> Tuple[Tensor, Tensor]:
17
+ raise NotImplementedError("Patchify method not implemented")
18
+
19
+ @abstractmethod
20
+ def unpatchify(
21
+ self,
22
+ latents: Tensor,
23
+ output_height: int,
24
+ output_width: int,
25
+ out_channels: int,
26
+ ) -> Tuple[Tensor, Tensor]:
27
+ pass
28
+
29
+ @property
30
+ def patch_size(self):
31
+ return self._patch_size
32
+
33
+ def get_latent_coords(
34
+ self, latent_num_frames, latent_height, latent_width, batch_size, device
35
+ ):
36
+ """
37
+ Return a tensor of shape [batch_size, 3, num_patches] containing the
38
+ top-left corner latent coordinates of each latent patch.
39
+ The tensor is repeated for each batch element.
40
+ """
41
+ latent_sample_coords = torch.meshgrid(
42
+ torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
43
+ torch.arange(0, latent_height, self._patch_size[1], device=device),
44
+ torch.arange(0, latent_width, self._patch_size[2], device=device),
45
+ )
46
+ latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
47
+ latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
48
+ latent_coords = rearrange(
49
+ latent_coords, "b c f h w -> b c (f h w)", b=batch_size
50
+ )
51
+ return latent_coords
52
+
53
+
54
+ class SymmetricPatchifier(Patchifier):
55
+ def patchify(self, latents: Tensor) -> Tuple[Tensor, Tensor]:
56
+ b, _, f, h, w = latents.shape
57
+ latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
58
+ latents = rearrange(
59
+ latents,
60
+ "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
61
+ p1=self._patch_size[0],
62
+ p2=self._patch_size[1],
63
+ p3=self._patch_size[2],
64
+ )
65
+ return latents, latent_coords
66
+
67
+ def unpatchify(
68
+ self,
69
+ latents: Tensor,
70
+ output_height: int,
71
+ output_width: int,
72
+ out_channels: int,
73
+ ) -> Tuple[Tensor, Tensor]:
74
+ output_height = output_height // self._patch_size[1]
75
+ output_width = output_width // self._patch_size[2]
76
+ latents = rearrange(
77
+ latents,
78
+ "b (f h w) (c p q) -> b c f (h p) (w q)",
79
+ h=output_height,
80
+ w=output_width,
81
+ p=self._patch_size[1],
82
+ q=self._patch_size[2],
83
+ )
84
+ return latents
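A round-trip sketch for `SymmetricPatchifier` (assuming the imports of this file; the latent shape is made up and `patch_size=1` is just one possible setting):

    patchifier = SymmetricPatchifier(patch_size=1)
    latents = torch.randn(1, 128, 8, 16, 24)             # (batch, channels, frames, height, width)
    tokens, coords = patchifier.patchify(latents)
    assert tokens.shape == (1, 8 * 16 * 24, 128)          # one token per latent position
    assert coords.shape == (1, 3, 8 * 16 * 24)            # (frame, row, col) index of each token
    restored = patchifier.unpatchify(tokens, output_height=16, output_width=24, out_channels=128)
    assert restored.shape == (1, 128, 8, 16, 24)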
flash_head/ltx_video/models/transformers/transformer3d.py ADDED
@@ -0,0 +1,507 @@
1
+ # Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/models/transformers/transformer_2d.py
2
+ import math
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Optional, Union
5
+ import os
6
+ import json
7
+ import glob
8
+ from pathlib import Path
9
+
10
+ import torch
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.models.embeddings import PixArtAlphaTextProjection
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+ from diffusers.models.normalization import AdaLayerNormSingle
15
+ from diffusers.utils import BaseOutput, is_torch_version
16
+ from diffusers.utils import logging
17
+ from torch import nn
18
+ from safetensors import safe_open
19
+
20
+
21
+ from flash_head.ltx_video.models.transformers.attention import BasicTransformerBlock
22
+ from flash_head.ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
23
+
24
+ from flash_head.ltx_video.utils.diffusers_config_mapping import (
25
+ diffusers_and_ours_config_mapping,
26
+ make_hashable_key,
27
+ TRANSFORMER_KEYS_RENAME_DICT,
28
+ )
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
+ @dataclass
35
+ class Transformer3DModelOutput(BaseOutput):
36
+ """
37
+ The output of [`Transformer2DModel`].
38
+
39
+ Args:
40
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
41
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
42
+ distributions for the unnoised latent pixels.
43
+ """
44
+
45
+ sample: torch.FloatTensor
46
+
47
+
48
+ class Transformer3DModel(ModelMixin, ConfigMixin):
49
+ _supports_gradient_checkpointing = True
50
+
51
+ @register_to_config
52
+ def __init__(
53
+ self,
54
+ num_attention_heads: int = 16,
55
+ attention_head_dim: int = 88,
56
+ in_channels: Optional[int] = None,
57
+ out_channels: Optional[int] = None,
58
+ num_layers: int = 1,
59
+ dropout: float = 0.0,
60
+ norm_num_groups: int = 32,
61
+ cross_attention_dim: Optional[int] = None,
62
+ attention_bias: bool = False,
63
+ num_vector_embeds: Optional[int] = None,
64
+ activation_fn: str = "geglu",
65
+ num_embeds_ada_norm: Optional[int] = None,
66
+ use_linear_projection: bool = False,
67
+ only_cross_attention: bool = False,
68
+ double_self_attention: bool = False,
69
+ upcast_attention: bool = False,
70
+ adaptive_norm: str = "single_scale_shift", # 'single_scale_shift' or 'single_scale'
71
+ standardization_norm: str = "layer_norm", # 'layer_norm' or 'rms_norm'
72
+ norm_elementwise_affine: bool = True,
73
+ norm_eps: float = 1e-5,
74
+ attention_type: str = "default",
75
+ caption_channels: int = None,
76
+ use_tpu_flash_attention: bool = False, # if True uses the TPU attention offload ('flash attention')
77
+ qk_norm: Optional[str] = None,
78
+ positional_embedding_type: str = "rope",
79
+ positional_embedding_theta: Optional[float] = None,
80
+ positional_embedding_max_pos: Optional[List[int]] = None,
81
+ timestep_scale_multiplier: Optional[float] = None,
82
+ causal_temporal_positioning: bool = False, # For backward compatibility, will be deprecated
83
+ ):
84
+ super().__init__()
85
+ self.use_tpu_flash_attention = (
86
+ use_tpu_flash_attention # FIXME: push config down to the attention modules
87
+ )
88
+ self.use_linear_projection = use_linear_projection
89
+ self.num_attention_heads = num_attention_heads
90
+ self.attention_head_dim = attention_head_dim
91
+ inner_dim = num_attention_heads * attention_head_dim
92
+ self.inner_dim = inner_dim
93
+ self.patchify_proj = nn.Linear(in_channels, inner_dim, bias=True)
94
+ self.positional_embedding_type = positional_embedding_type
95
+ self.positional_embedding_theta = positional_embedding_theta
96
+ self.positional_embedding_max_pos = positional_embedding_max_pos
97
+ self.use_rope = self.positional_embedding_type == "rope"
98
+ self.timestep_scale_multiplier = timestep_scale_multiplier
99
+
100
+ if self.positional_embedding_type == "absolute":
101
+ raise ValueError("Absolute positional embedding is no longer supported")
102
+ elif self.positional_embedding_type == "rope":
103
+ if positional_embedding_theta is None:
104
+ raise ValueError(
105
+ "If `positional_embedding_type` type is rope, `positional_embedding_theta` must also be defined"
106
+ )
107
+ if positional_embedding_max_pos is None:
108
+ raise ValueError(
109
+ "If `positional_embedding_type` type is rope, `positional_embedding_max_pos` must also be defined"
110
+ )
111
+
112
+ # 3. Define transformers blocks
113
+ self.transformer_blocks = nn.ModuleList(
114
+ [
115
+ BasicTransformerBlock(
116
+ inner_dim,
117
+ num_attention_heads,
118
+ attention_head_dim,
119
+ dropout=dropout,
120
+ cross_attention_dim=cross_attention_dim,
121
+ activation_fn=activation_fn,
122
+ num_embeds_ada_norm=num_embeds_ada_norm,
123
+ attention_bias=attention_bias,
124
+ only_cross_attention=only_cross_attention,
125
+ double_self_attention=double_self_attention,
126
+ upcast_attention=upcast_attention,
127
+ adaptive_norm=adaptive_norm,
128
+ standardization_norm=standardization_norm,
129
+ norm_elementwise_affine=norm_elementwise_affine,
130
+ norm_eps=norm_eps,
131
+ attention_type=attention_type,
132
+ use_tpu_flash_attention=use_tpu_flash_attention,
133
+ qk_norm=qk_norm,
134
+ use_rope=self.use_rope,
135
+ )
136
+ for d in range(num_layers)
137
+ ]
138
+ )
139
+
140
+ # 4. Define output layers
141
+ self.out_channels = in_channels if out_channels is None else out_channels
142
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
143
+ self.scale_shift_table = nn.Parameter(
144
+ torch.randn(2, inner_dim) / inner_dim**0.5
145
+ )
146
+ self.proj_out = nn.Linear(inner_dim, self.out_channels)
147
+
148
+ self.adaln_single = AdaLayerNormSingle(
149
+ inner_dim, use_additional_conditions=False
150
+ )
151
+ if adaptive_norm == "single_scale":
152
+ self.adaln_single.linear = nn.Linear(inner_dim, 4 * inner_dim, bias=True)
153
+
154
+ self.caption_projection = None
155
+ if caption_channels is not None:
156
+ self.caption_projection = PixArtAlphaTextProjection(
157
+ in_features=caption_channels, hidden_size=inner_dim
158
+ )
159
+
160
+ self.gradient_checkpointing = False
161
+
162
+ def set_use_tpu_flash_attention(self):
163
+ r"""
164
+ Function sets the flag in this object and propagates down the children. The flag will enforce the usage of TPU
165
+ attention kernel.
166
+ """
167
+ logger.info("ENABLE TPU FLASH ATTENTION -> TRUE")
168
+ self.use_tpu_flash_attention = True
169
+ # push config down to the attention modules
170
+ for block in self.transformer_blocks:
171
+ block.set_use_tpu_flash_attention()
172
+
173
+ def create_skip_layer_mask(
174
+ self,
175
+ batch_size: int,
176
+ num_conds: int,
177
+ ptb_index: int,
178
+ skip_block_list: Optional[List[int]] = None,
179
+ ):
180
+ if skip_block_list is None or len(skip_block_list) == 0:
181
+ return None
182
+ num_layers = len(self.transformer_blocks)
183
+ mask = torch.ones(
184
+ (num_layers, batch_size * num_conds), device=self.device, dtype=self.dtype
185
+ )
186
+ for block_idx in skip_block_list:
187
+ mask[block_idx, ptb_index::num_conds] = 0
188
+ return mask
189
+
190
+ def _set_gradient_checkpointing(self, module, value=False):
191
+ if hasattr(module, "gradient_checkpointing"):
192
+ module.gradient_checkpointing = value
193
+
194
+ def get_fractional_positions(self, indices_grid):
195
+ fractional_positions = torch.stack(
196
+ [
197
+ indices_grid[:, i] / self.positional_embedding_max_pos[i]
198
+ for i in range(3)
199
+ ],
200
+ dim=-1,
201
+ )
202
+ return fractional_positions
203
+
204
+ def precompute_freqs_cis(self, indices_grid, spacing="exp"):
205
+ dtype = torch.float32 # We need full precision in the freqs_cis computation.
206
+ dim = self.inner_dim
207
+ theta = self.positional_embedding_theta
208
+
209
+ fractional_positions = self.get_fractional_positions(indices_grid)
210
+
211
+ start = 1
212
+ end = theta
213
+ device = fractional_positions.device
214
+ if spacing == "exp":
215
+ indices = theta ** (
216
+ torch.linspace(
217
+ math.log(start, theta),
218
+ math.log(end, theta),
219
+ dim // 6,
220
+ device=device,
221
+ dtype=dtype,
222
+ )
223
+ )
224
+ indices = indices.to(dtype=dtype)
225
+ elif spacing == "exp_2":
226
+ indices = 1.0 / theta ** (torch.arange(0, dim, 6, device=device) / dim)
227
+ indices = indices.to(dtype=dtype)
228
+ elif spacing == "linear":
229
+ indices = torch.linspace(start, end, dim // 6, device=device, dtype=dtype)
230
+ elif spacing == "sqrt":
231
+ indices = torch.linspace(
232
+ start**2, end**2, dim // 6, device=device, dtype=dtype
233
+ ).sqrt()
234
+
235
+ indices = indices * math.pi / 2
236
+
237
+ if spacing == "exp_2":
238
+ freqs = (
239
+ (indices * fractional_positions.unsqueeze(-1))
240
+ .transpose(-1, -2)
241
+ .flatten(2)
242
+ )
243
+ else:
244
+ freqs = (
245
+ (indices * (fractional_positions.unsqueeze(-1) * 2 - 1))
246
+ .transpose(-1, -2)
247
+ .flatten(2)
248
+ )
249
+
250
+ cos_freq = freqs.cos().repeat_interleave(2, dim=-1)
251
+ sin_freq = freqs.sin().repeat_interleave(2, dim=-1)
252
+ if dim % 6 != 0:
253
+ cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6])
254
+ sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6])
255
+ cos_freq = torch.cat([cos_padding, cos_freq], dim=-1)
256
+ sin_freq = torch.cat([sin_padding, sin_freq], dim=-1)
257
+ return cos_freq.to(self.dtype), sin_freq.to(self.dtype)
258
+
259
+ def load_state_dict(
260
+ self,
261
+ state_dict: Dict,
262
+ *args,
263
+ **kwargs,
264
+ ):
265
+ if any([key.startswith("model.diffusion_model.") for key in state_dict.keys()]):
266
+ state_dict = {
267
+ key.replace("model.diffusion_model.", ""): value
268
+ for key, value in state_dict.items()
269
+ if key.startswith("model.diffusion_model.")
270
+ }
271
+ super().load_state_dict(state_dict, **kwargs)
272
+
273
+ @classmethod
274
+ def from_pretrained(
275
+ cls,
276
+ pretrained_model_path: Optional[Union[str, os.PathLike]],
277
+ *args,
278
+ **kwargs,
279
+ ):
280
+ pretrained_model_path = Path(pretrained_model_path)
281
+ if pretrained_model_path.is_dir():
282
+ config_path = pretrained_model_path / "transformer" / "config.json"
283
+ with open(config_path, "r") as f:
284
+ config = make_hashable_key(json.load(f))
285
+
286
+ assert config in diffusers_and_ours_config_mapping, (
287
+ "Provided diffusers checkpoint config for transformer is not suppported. "
288
+ "We only support diffusers configs found in Lightricks/LTX-Video."
289
+ )
290
+
291
+ config = diffusers_and_ours_config_mapping[config]
292
+ state_dict = {}
293
+ ckpt_paths = (
294
+ pretrained_model_path
295
+ / "transformer"
296
+ / "diffusion_pytorch_model*.safetensors"
297
+ )
298
+ dict_list = glob.glob(str(ckpt_paths))
299
+ for dict_path in dict_list:
300
+ part_dict = {}
301
+ with safe_open(dict_path, framework="pt", device="cpu") as f:
302
+ for k in f.keys():
303
+ part_dict[k] = f.get_tensor(k)
304
+ state_dict.update(part_dict)
305
+
306
+ for key in list(state_dict.keys()):
307
+ new_key = key
308
+ for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
309
+ new_key = new_key.replace(replace_key, rename_key)
310
+ state_dict[new_key] = state_dict.pop(key)
311
+
312
+ with torch.device("meta"):
313
+ transformer = cls.from_config(config)
314
+ transformer.load_state_dict(state_dict, assign=True, strict=True)
315
+ elif pretrained_model_path.is_file() and str(pretrained_model_path).endswith(
316
+ ".safetensors"
317
+ ):
318
+ comfy_single_file_state_dict = {}
319
+ with safe_open(pretrained_model_path, framework="pt", device="cpu") as f:
320
+ metadata = f.metadata()
321
+ for k in f.keys():
322
+ comfy_single_file_state_dict[k] = f.get_tensor(k)
323
+ configs = json.loads(metadata["config"])
324
+ transformer_config = configs["transformer"]
325
+ with torch.device("meta"):
326
+ transformer = Transformer3DModel.from_config(transformer_config)
327
+ transformer.load_state_dict(comfy_single_file_state_dict, assign=True)
328
+ return transformer
329
+
330
+ def forward(
331
+ self,
332
+ hidden_states: torch.Tensor,
333
+ indices_grid: torch.Tensor,
334
+ encoder_hidden_states: Optional[torch.Tensor] = None,
335
+ timestep: Optional[torch.LongTensor] = None,
336
+ class_labels: Optional[torch.LongTensor] = None,
337
+ cross_attention_kwargs: Dict[str, Any] = None,
338
+ attention_mask: Optional[torch.Tensor] = None,
339
+ encoder_attention_mask: Optional[torch.Tensor] = None,
340
+ skip_layer_mask: Optional[torch.Tensor] = None,
341
+ skip_layer_strategy: Optional[SkipLayerStrategy] = None,
342
+ return_dict: bool = True,
343
+ ):
344
+ """
345
+ The [`Transformer2DModel`] forward method.
346
+
347
+ Args:
348
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
349
+ Input `hidden_states`.
350
+ indices_grid (`torch.LongTensor` of shape `(batch size, 3, num latent pixels)`):
351
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
352
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
353
+ self-attention.
354
+ timestep ( `torch.LongTensor`, *optional*):
355
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
356
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
357
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
358
+ `AdaLayerZeroNorm`.
359
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
360
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
361
+ `self.processor` in
362
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
363
+ attention_mask ( `torch.Tensor`, *optional*):
364
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
365
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
366
+ negative values to the attention scores corresponding to "discard" tokens.
367
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
368
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
369
+
370
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
371
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
372
+
373
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
374
+ above. This bias will be added to the cross-attention scores.
375
+ skip_layer_mask ( `torch.Tensor`, *optional*):
376
+ A mask of shape `(num_layers, batch)` that indicates which layers to skip. `0` at position
377
+ `layer, batch_idx` indicates that the layer should be skipped for the corresponding batch index.
378
+ skip_layer_strategy ( `SkipLayerStrategy`, *optional*, defaults to `None`):
379
+ Controls which layers are skipped when calculating a perturbed latent for spatiotemporal guidance.
380
+ return_dict (`bool`, *optional*, defaults to `True`):
381
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
382
+ tuple.
383
+
384
+ Returns:
385
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
386
+ `tuple` where the first element is the sample tensor.
387
+ """
388
+ # for tpu attention offload 2d token masks are used. No need to transform.
389
+ if not self.use_tpu_flash_attention:
390
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
391
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
392
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
393
+ # expects mask of shape:
394
+ # [batch, key_tokens]
395
+ # adds singleton query_tokens dimension:
396
+ # [batch, 1, key_tokens]
397
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
398
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
399
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
400
+ if attention_mask is not None and attention_mask.ndim == 2:
401
+ # assume that mask is expressed as:
402
+ # (1 = keep, 0 = discard)
403
+ # convert mask into a bias that can be added to attention scores:
404
+ # (keep = +0, discard = -10000.0)
405
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
406
+ attention_mask = attention_mask.unsqueeze(1)
407
+
408
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
409
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
410
+ encoder_attention_mask = (
411
+ 1 - encoder_attention_mask.to(hidden_states.dtype)
412
+ ) * -10000.0
413
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
414
+
415
+ # 1. Input
416
+ hidden_states = self.patchify_proj(hidden_states)
417
+
418
+ if self.timestep_scale_multiplier:
419
+ timestep = self.timestep_scale_multiplier * timestep
420
+
421
+ freqs_cis = self.precompute_freqs_cis(indices_grid)
422
+
423
+ batch_size = hidden_states.shape[0]
424
+ timestep, embedded_timestep = self.adaln_single(
425
+ timestep.flatten(),
426
+ {"resolution": None, "aspect_ratio": None},
427
+ batch_size=batch_size,
428
+ hidden_dtype=hidden_states.dtype,
429
+ )
430
+ # Second dimension is 1 or number of tokens (if timestep_per_token)
431
+ timestep = timestep.view(batch_size, -1, timestep.shape[-1])
432
+ embedded_timestep = embedded_timestep.view(
433
+ batch_size, -1, embedded_timestep.shape[-1]
434
+ )
435
+
436
+ # 2. Blocks
437
+ if self.caption_projection is not None:
438
+ batch_size = hidden_states.shape[0]
439
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states)
440
+ encoder_hidden_states = encoder_hidden_states.view(
441
+ batch_size, -1, hidden_states.shape[-1]
442
+ )
443
+
444
+ for block_idx, block in enumerate(self.transformer_blocks):
445
+ if self.training and self.gradient_checkpointing:
446
+
447
+ def create_custom_forward(module, return_dict=None):
448
+ def custom_forward(*inputs):
449
+ if return_dict is not None:
450
+ return module(*inputs, return_dict=return_dict)
451
+ else:
452
+ return module(*inputs)
453
+
454
+ return custom_forward
455
+
456
+ ckpt_kwargs: Dict[str, Any] = (
457
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
458
+ )
459
+ hidden_states = torch.utils.checkpoint.checkpoint(
460
+ create_custom_forward(block),
461
+ hidden_states,
462
+ freqs_cis,
463
+ attention_mask,
464
+ encoder_hidden_states,
465
+ encoder_attention_mask,
466
+ timestep,
467
+ cross_attention_kwargs,
468
+ class_labels,
469
+ (
470
+ skip_layer_mask[block_idx]
471
+ if skip_layer_mask is not None
472
+ else None
473
+ ),
474
+ skip_layer_strategy,
475
+ **ckpt_kwargs,
476
+ )
477
+ else:
478
+ hidden_states = block(
479
+ hidden_states,
480
+ freqs_cis=freqs_cis,
481
+ attention_mask=attention_mask,
482
+ encoder_hidden_states=encoder_hidden_states,
483
+ encoder_attention_mask=encoder_attention_mask,
484
+ timestep=timestep,
485
+ cross_attention_kwargs=cross_attention_kwargs,
486
+ class_labels=class_labels,
487
+ skip_layer_mask=(
488
+ skip_layer_mask[block_idx]
489
+ if skip_layer_mask is not None
490
+ else None
491
+ ),
492
+ skip_layer_strategy=skip_layer_strategy,
493
+ )
494
+
495
+ # 3. Output
496
+ scale_shift_values = (
497
+ self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
498
+ )
499
+ shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
500
+ hidden_states = self.norm_out(hidden_states)
501
+ # Modulation
502
+ hidden_states = hidden_states * (1 + scale) + shift
503
+ hidden_states = self.proj_out(hidden_states)
504
+ if not return_dict:
505
+ return (hidden_states,)
506
+
507
+ return Transformer3DModelOutput(sample=hidden_states)
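The mask handling at the top of this forward pass converts a 2D keep/discard mask into an additive attention bias. A minimal standalone sketch of that conversion (the helper name is ours; the shapes and the -10000.0 fill value mirror the code above):

```python
import torch

def mask_to_additive_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    # mask: [batch, key_tokens], 1 = keep, 0 = discard
    bias = (1 - mask.to(dtype)) * -10000.0  # keep -> 0.0, discard -> -10000.0
    return bias.unsqueeze(1)                # [batch, 1, key_tokens], broadcasts over query tokens / heads

mask = torch.tensor([[1, 1, 0, 0]])
print(mask_to_additive_bias(mask, torch.float32))  # zeros for kept tokens, -10000 for masked ones
```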
flash_head/ltx_video/utils/__init__.py ADDED
File without changes
flash_head/ltx_video/utils/diffusers_config_mapping.py ADDED
@@ -0,0 +1,174 @@
1
+ def make_hashable_key(dict_key):
2
+ def convert_value(value):
3
+ if isinstance(value, list):
4
+ return tuple(value)
5
+ elif isinstance(value, dict):
6
+ return tuple(sorted((k, convert_value(v)) for k, v in value.items()))
7
+ else:
8
+ return value
9
+
10
+ return tuple(sorted((k, convert_value(v)) for k, v in dict_key.items()))
11
+
12
+
13
+ DIFFUSERS_SCHEDULER_CONFIG = {
14
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
15
+ "_diffusers_version": "0.32.0.dev0",
16
+ "base_image_seq_len": 1024,
17
+ "base_shift": 0.95,
18
+ "invert_sigmas": False,
19
+ "max_image_seq_len": 4096,
20
+ "max_shift": 2.05,
21
+ "num_train_timesteps": 1000,
22
+ "shift": 1.0,
23
+ "shift_terminal": 0.1,
24
+ "use_beta_sigmas": False,
25
+ "use_dynamic_shifting": True,
26
+ "use_exponential_sigmas": False,
27
+ "use_karras_sigmas": False,
28
+ }
29
+ DIFFUSERS_TRANSFORMER_CONFIG = {
30
+ "_class_name": "LTXVideoTransformer3DModel",
31
+ "_diffusers_version": "0.32.0.dev0",
32
+ "activation_fn": "gelu-approximate",
33
+ "attention_bias": True,
34
+ "attention_head_dim": 64,
35
+ "attention_out_bias": True,
36
+ "caption_channels": 4096,
37
+ "cross_attention_dim": 2048,
38
+ "in_channels": 128,
39
+ "norm_elementwise_affine": False,
40
+ "norm_eps": 1e-06,
41
+ "num_attention_heads": 32,
42
+ "num_layers": 28,
43
+ "out_channels": 128,
44
+ "patch_size": 1,
45
+ "patch_size_t": 1,
46
+ "qk_norm": "rms_norm_across_heads",
47
+ }
48
+ DIFFUSERS_VAE_CONFIG = {
49
+ "_class_name": "AutoencoderKLLTXVideo",
50
+ "_diffusers_version": "0.32.0.dev0",
51
+ "block_out_channels": [128, 256, 512, 512],
52
+ "decoder_causal": False,
53
+ "encoder_causal": True,
54
+ "in_channels": 3,
55
+ "latent_channels": 128,
56
+ "layers_per_block": [4, 3, 3, 3, 4],
57
+ "out_channels": 3,
58
+ "patch_size": 4,
59
+ "patch_size_t": 1,
60
+ "resnet_norm_eps": 1e-06,
61
+ "scaling_factor": 1.0,
62
+ "spatio_temporal_scaling": [True, True, True, False],
63
+ }
64
+
65
+ OURS_SCHEDULER_CONFIG = {
66
+ "_class_name": "RectifiedFlowScheduler",
67
+ "_diffusers_version": "0.25.1",
68
+ "num_train_timesteps": 1000,
69
+ "shifting": "SD3",
70
+ "base_resolution": None,
71
+ "target_shift_terminal": 0.1,
72
+ }
73
+
74
+ OURS_TRANSFORMER_CONFIG = {
75
+ "_class_name": "Transformer3DModel",
76
+ "_diffusers_version": "0.25.1",
77
+ "_name_or_path": "PixArt-alpha/PixArt-XL-2-256x256",
78
+ "activation_fn": "gelu-approximate",
79
+ "attention_bias": True,
80
+ "attention_head_dim": 64,
81
+ "attention_type": "default",
82
+ "caption_channels": 4096,
83
+ "cross_attention_dim": 2048,
84
+ "double_self_attention": False,
85
+ "dropout": 0.0,
86
+ "in_channels": 128,
87
+ "norm_elementwise_affine": False,
88
+ "norm_eps": 1e-06,
89
+ "norm_num_groups": 32,
90
+ "num_attention_heads": 32,
91
+ "num_embeds_ada_norm": 1000,
92
+ "num_layers": 28,
93
+ "num_vector_embeds": None,
94
+ "only_cross_attention": False,
95
+ "out_channels": 128,
96
+ "project_to_2d_pos": True,
97
+ "upcast_attention": False,
98
+ "use_linear_projection": False,
99
+ "qk_norm": "rms_norm",
100
+ "standardization_norm": "rms_norm",
101
+ "positional_embedding_type": "rope",
102
+ "positional_embedding_theta": 10000.0,
103
+ "positional_embedding_max_pos": [20, 2048, 2048],
104
+ "timestep_scale_multiplier": 1000,
105
+ }
106
+ OURS_VAE_CONFIG = {
107
+ "_class_name": "CausalVideoAutoencoder",
108
+ "dims": 3,
109
+ "in_channels": 3,
110
+ "out_channels": 3,
111
+ "latent_channels": 128,
112
+ "blocks": [
113
+ ["res_x", 4],
114
+ ["compress_all", 1],
115
+ ["res_x_y", 1],
116
+ ["res_x", 3],
117
+ ["compress_all", 1],
118
+ ["res_x_y", 1],
119
+ ["res_x", 3],
120
+ ["compress_all", 1],
121
+ ["res_x", 3],
122
+ ["res_x", 4],
123
+ ],
124
+ "scaling_factor": 1.0,
125
+ "norm_layer": "pixel_norm",
126
+ "patch_size": 4,
127
+ "latent_log_var": "uniform",
128
+ "use_quant_conv": False,
129
+ "causal_decoder": False,
130
+ }
131
+
132
+
133
+ diffusers_and_ours_config_mapping = {
134
+ make_hashable_key(DIFFUSERS_SCHEDULER_CONFIG): OURS_SCHEDULER_CONFIG,
135
+ make_hashable_key(DIFFUSERS_TRANSFORMER_CONFIG): OURS_TRANSFORMER_CONFIG,
136
+ make_hashable_key(DIFFUSERS_VAE_CONFIG): OURS_VAE_CONFIG,
137
+ }
138
+
139
+
140
+ TRANSFORMER_KEYS_RENAME_DICT = {
141
+ "proj_in": "patchify_proj",
142
+ "time_embed": "adaln_single",
143
+ "norm_q": "q_norm",
144
+ "norm_k": "k_norm",
145
+ }
146
+
147
+
148
+ VAE_KEYS_RENAME_DICT = {
149
+ "decoder.up_blocks.3.conv_in": "decoder.up_blocks.7",
150
+ "decoder.up_blocks.3.upsamplers.0": "decoder.up_blocks.8",
151
+ "decoder.up_blocks.3": "decoder.up_blocks.9",
152
+ "decoder.up_blocks.2.upsamplers.0": "decoder.up_blocks.5",
153
+ "decoder.up_blocks.2.conv_in": "decoder.up_blocks.4",
154
+ "decoder.up_blocks.2": "decoder.up_blocks.6",
155
+ "decoder.up_blocks.1.upsamplers.0": "decoder.up_blocks.2",
156
+ "decoder.up_blocks.1": "decoder.up_blocks.3",
157
+ "decoder.up_blocks.0": "decoder.up_blocks.1",
158
+ "decoder.mid_block": "decoder.up_blocks.0",
159
+ "encoder.down_blocks.3": "encoder.down_blocks.8",
160
+ "encoder.down_blocks.2.downsamplers.0": "encoder.down_blocks.7",
161
+ "encoder.down_blocks.2": "encoder.down_blocks.6",
162
+ "encoder.down_blocks.1.downsamplers.0": "encoder.down_blocks.4",
163
+ "encoder.down_blocks.1.conv_out": "encoder.down_blocks.5",
164
+ "encoder.down_blocks.1": "encoder.down_blocks.3",
165
+ "encoder.down_blocks.0.conv_out": "encoder.down_blocks.2",
166
+ "encoder.down_blocks.0.downsamplers.0": "encoder.down_blocks.1",
167
+ "encoder.down_blocks.0": "encoder.down_blocks.0",
168
+ "encoder.mid_block": "encoder.down_blocks.9",
169
+ "conv_shortcut.conv": "conv_shortcut",
170
+ "resnets": "res_blocks",
171
+ "norm3": "norm3.norm",
172
+ "latents_mean": "per_channel_statistics.mean-of-means",
173
+ "latents_std": "per_channel_statistics.std-of-means",
174
+ }
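The mapping above is keyed on hashable versions of full Diffusers config dicts. A small sketch of the intended lookup, assuming the names defined in this file are in scope (the toy dict is illustrative only):

```python
# Lists become tuples and dicts become sorted (key, value) tuples, so the key is hashable.
toy = {"_class_name": "Example", "sizes": [128, 256], "nested": {"a": 1}}
assert isinstance(hash(make_hashable_key(toy)), int)

# A full Diffusers config maps to the corresponding native config (None if unknown).
ours = diffusers_and_ours_config_mapping.get(make_hashable_key(DIFFUSERS_SCHEDULER_CONFIG))
assert ours is OURS_SCHEDULER_CONFIG
```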
flash_head/ltx_video/utils/prompt_enhance_utils.py ADDED
@@ -0,0 +1,226 @@
1
+ import logging
2
+ from typing import Union, List, Optional
3
+
4
+ import torch
5
+ from PIL import Image
6
+
7
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
8
+
9
+ T2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
10
+ Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
11
+ Start directly with the action, and keep descriptions literal and precise.
12
+ Think like a cinematographer describing a shot list.
13
+ Do not change the user input intent, just enhance it.
14
+ Keep within 150 words.
15
+ For best results, build your prompts using this structure:
16
+ Start with main action in a single sentence
17
+ Add specific details about movements and gestures
18
+ Describe character/object appearances precisely
19
+ Include background and environment details
20
+ Specify camera angles and movements
21
+ Describe lighting and colors
22
+ Note any changes or sudden events
23
+ Do not exceed the 150 word limit!
24
+ Output the enhanced prompt only.
25
+ """
26
+
27
+ I2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
28
+ Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
29
+ Start directly with the action, and keep descriptions literal and precise.
30
+ Think like a cinematographer describing a shot list.
31
+ Keep within 150 words.
32
+ For best results, build your prompts using this structure:
33
+ Describe the image first and then add the user input. Image description should be in first priority! Align to the image caption if it contradicts the user text input.
34
+ Start with main action in a single sentence
35
+ Add specific details about movements and gestures
36
+ Describe character/object appearances precisely
37
+ Include background and environment details
38
+ Specify camera angles and movements
39
+ Describe lighting and colors
40
+ Note any changes or sudden events
41
+ Align to the image caption if it contradicts the user text input.
42
+ Do not exceed the 150 word limit!
43
+ Output the enhanced prompt only.
44
+ """
45
+
46
+
47
+ def tensor_to_pil(tensor):
48
+ # Ensure tensor is in range [-1, 1]
49
+ assert tensor.min() >= -1 and tensor.max() <= 1
50
+
51
+ # Convert from [-1, 1] to [0, 1]
52
+ tensor = (tensor + 1) / 2
53
+
54
+ # Rearrange from [C, H, W] to [H, W, C]
55
+ tensor = tensor.permute(1, 2, 0)
56
+
57
+ # Convert to numpy array and then to uint8 range [0, 255]
58
+ numpy_image = (tensor.cpu().numpy() * 255).astype("uint8")
59
+
60
+ # Convert to PIL Image
61
+ return Image.fromarray(numpy_image)
62
+
63
+
64
+ def generate_cinematic_prompt(
65
+ image_caption_model,
66
+ image_caption_processor,
67
+ prompt_enhancer_model,
68
+ prompt_enhancer_tokenizer,
69
+ prompt: Union[str, List[str]],
70
+ conditioning_items: Optional[List] = None,
71
+ max_new_tokens: int = 256,
72
+ ) -> List[str]:
73
+ prompts = [prompt] if isinstance(prompt, str) else prompt
74
+
75
+ if conditioning_items is None:
76
+ prompts = _generate_t2v_prompt(
77
+ prompt_enhancer_model,
78
+ prompt_enhancer_tokenizer,
79
+ prompts,
80
+ max_new_tokens,
81
+ T2V_CINEMATIC_PROMPT,
82
+ )
83
+ else:
84
+ if len(conditioning_items) > 1 or conditioning_items[0].media_frame_number != 0:
85
+ logger.warning(
86
+ "prompt enhancement does only support unconditional or first frame of conditioning items, returning original prompts"
87
+ )
88
+ return prompts
89
+
90
+ first_frame_conditioning_item = conditioning_items[0]
91
+ first_frames = _get_first_frames_from_conditioning_item(
92
+ first_frame_conditioning_item
93
+ )
94
+
95
+ assert len(first_frames) == len(
96
+ prompts
97
+ ), "Number of conditioning frames must match number of prompts"
98
+
99
+ prompts = _generate_i2v_prompt(
100
+ image_caption_model,
101
+ image_caption_processor,
102
+ prompt_enhancer_model,
103
+ prompt_enhancer_tokenizer,
104
+ prompts,
105
+ first_frames,
106
+ max_new_tokens,
107
+ I2V_CINEMATIC_PROMPT,
108
+ )
109
+
110
+ return prompts
111
+
112
+
113
+ def _get_first_frames_from_conditioning_item(conditioning_item) -> List[Image.Image]:
114
+ frames_tensor = conditioning_item.media_item
115
+ return [
116
+ tensor_to_pil(frames_tensor[i, :, 0, :, :])
117
+ for i in range(frames_tensor.shape[0])
118
+ ]
119
+
120
+
121
+ def _generate_t2v_prompt(
122
+ prompt_enhancer_model,
123
+ prompt_enhancer_tokenizer,
124
+ prompts: List[str],
125
+ max_new_tokens: int,
126
+ system_prompt: str,
127
+ ) -> List[str]:
128
+ messages = [
129
+ [
130
+ {"role": "system", "content": system_prompt},
131
+ {"role": "user", "content": f"user_prompt: {p}"},
132
+ ]
133
+ for p in prompts
134
+ ]
135
+
136
+ texts = [
137
+ prompt_enhancer_tokenizer.apply_chat_template(
138
+ m, tokenize=False, add_generation_prompt=True
139
+ )
140
+ for m in messages
141
+ ]
142
+ model_inputs = prompt_enhancer_tokenizer(texts, return_tensors="pt").to(
143
+ prompt_enhancer_model.device
144
+ )
145
+
146
+ return _generate_and_decode_prompts(
147
+ prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens
148
+ )
149
+
150
+
151
+ def _generate_i2v_prompt(
152
+ image_caption_model,
153
+ image_caption_processor,
154
+ prompt_enhancer_model,
155
+ prompt_enhancer_tokenizer,
156
+ prompts: List[str],
157
+ first_frames: List[Image.Image],
158
+ max_new_tokens: int,
159
+ system_prompt: str,
160
+ ) -> List[str]:
161
+ image_captions = _generate_image_captions(
162
+ image_caption_model, image_caption_processor, first_frames
163
+ )
164
+
165
+ messages = [
166
+ [
167
+ {"role": "system", "content": system_prompt},
168
+ {"role": "user", "content": f"user_prompt: {p}\nimage_caption: {c}"},
169
+ ]
170
+ for p, c in zip(prompts, image_captions)
171
+ ]
172
+
173
+ texts = [
174
+ prompt_enhancer_tokenizer.apply_chat_template(
175
+ m, tokenize=False, add_generation_prompt=True
176
+ )
177
+ for m in messages
178
+ ]
179
+ model_inputs = prompt_enhancer_tokenizer(texts, return_tensors="pt").to(
180
+ prompt_enhancer_model.device
181
+ )
182
+
183
+ return _generate_and_decode_prompts(
184
+ prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens
185
+ )
186
+
187
+
188
+ def _generate_image_captions(
189
+ image_caption_model,
190
+ image_caption_processor,
191
+ images: List[Image.Image],
192
+ system_prompt: str = "<DETAILED_CAPTION>",
193
+ ) -> List[str]:
194
+ image_caption_prompts = [system_prompt] * len(images)
195
+ inputs = image_caption_processor(
196
+ image_caption_prompts, images, return_tensors="pt"
197
+ ).to(image_caption_model.device)
198
+
199
+ with torch.inference_mode():
200
+ generated_ids = image_caption_model.generate(
201
+ input_ids=inputs["input_ids"],
202
+ pixel_values=inputs["pixel_values"],
203
+ max_new_tokens=1024,
204
+ do_sample=False,
205
+ num_beams=3,
206
+ )
207
+
208
+ return image_caption_processor.batch_decode(generated_ids, skip_special_tokens=True)
209
+
210
+
211
+ def _generate_and_decode_prompts(
212
+ prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens: int
213
+ ) -> List[str]:
214
+ with torch.inference_mode():
215
+ outputs = prompt_enhancer_model.generate(
216
+ **model_inputs, max_new_tokens=max_new_tokens
217
+ )
218
+ generated_ids = [
219
+ output_ids[len(input_ids) :]
220
+ for input_ids, output_ids in zip(model_inputs.input_ids, outputs)
221
+ ]
222
+ decoded_prompts = prompt_enhancer_tokenizer.batch_decode(
223
+ generated_ids, skip_special_tokens=True
224
+ )
225
+
226
+ return decoded_prompts
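The slicing in `_generate_and_decode_prompts` relies on decoder-only models echoing the prompt: the first `len(input_ids)` tokens of each output row are the prompt itself, so only the generated suffix is decoded. A toy illustration with plain lists:

```python
input_ids = [101, 7592, 2088]             # prompt tokens (illustrative values)
output_ids = [101, 7592, 2088, 345, 678]  # model output = prompt + newly generated tokens
generated = output_ids[len(input_ids):]
assert generated == [345, 678]
```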
flash_head/ltx_video/utils/skip_layer_strategy.py ADDED
@@ -0,0 +1,8 @@
1
+ from enum import Enum, auto
2
+
3
+
4
+ class SkipLayerStrategy(Enum):
5
+ AttentionSkip = auto()
6
+ AttentionValues = auto()
7
+ Residual = auto()
8
+ TransformerBlock = auto()
flash_head/ltx_video/utils/torch_utils.py ADDED
@@ -0,0 +1,25 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
6
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
7
+ dims_to_append = target_dims - x.ndim
8
+ if dims_to_append < 0:
9
+ raise ValueError(
10
+ f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
11
+ )
12
+ elif dims_to_append == 0:
13
+ return x
14
+ return x[(...,) + (None,) * dims_to_append]
15
+
16
+
17
+ class Identity(nn.Module):
18
+ """A placeholder identity operator that is argument-insensitive."""
19
+
20
+ def __init__(self, *args, **kwargs) -> None: # pylint: disable=unused-argument
21
+ super().__init__()
22
+
23
+ # pylint: disable=unused-argument
24
+ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
25
+ return x
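A small usage sketch for `append_dims`, e.g. to broadcast one scalar per sample against a video latent (shapes are illustrative):

```python
import torch

sigmas = torch.rand(4)                      # one value per sample, shape [4]
latents = torch.rand(4, 16, 9, 64, 64)      # [B, C, T, H, W]
sigmas = append_dims(sigmas, latents.ndim)  # shape [4, 1, 1, 1, 1]
scaled = latents * sigmas                   # broadcasts over C, T, H, W
```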
flash_head/src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
flash_head/src/distributed/usp_device.py ADDED
@@ -0,0 +1,35 @@
1
+ import math
2
+ from loguru import logger
3
+ import datetime
4
+ import torch
5
+ import torch.distributed as dist
6
+
7
+ def get_parallel_degree(world_size, num_heads):
8
+ # ulysses_degree is faster, and must be a divisor of num_heads
9
+ ulysses_degree = math.gcd(world_size, num_heads)
10
+ ring_degree = world_size // ulysses_degree
11
+ return ulysses_degree, ring_degree
12
+
13
+ def get_device(ulysses_degree, ring_degree):
14
+ if ulysses_degree > 1 or ring_degree > 1:
15
+ from xfuser.core.distributed import (
16
+ init_distributed_environment,
17
+ initialize_model_parallel,
18
+ get_world_group,
19
+ )
20
+
21
+ dist.init_process_group("nccl", timeout=datetime.timedelta(hours=24*7))
22
+ init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
23
+ initialize_model_parallel(
24
+ sequence_parallel_degree=dist.get_world_size(),
25
+ ring_degree=ring_degree,
26
+ ulysses_degree=ulysses_degree
27
+ )
28
+
29
+ device = torch.device(f"cuda:{get_world_group().rank}")
30
+ torch.cuda.set_device(get_world_group().rank)
31
+
32
+ logger.info(f'rank={get_world_group().rank} device={str(device)}')
33
+ else:
34
+ device = "cuda"
35
+ return device
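`get_parallel_degree` keeps the Ulysses degree as large as possible (it must divide the head count) and puts the remainder into the ring degree, so `ulysses_degree * ring_degree == world_size`. A sketch with the 32 attention heads used by the model configs in this repo:

```python
print(get_parallel_degree(8, 32))   # (8, 1): gcd(8, 32) = 8, pure Ulysses parallelism
print(get_parallel_degree(12, 32))  # (4, 3): gcd(12, 32) = 4 Ulysses, 3-way ring attention
```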
flash_head/src/modules/flash_head_model.py ADDED
@@ -0,0 +1,548 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import math
5
+ from typing import Tuple, Optional
6
+ from einops import rearrange
7
+ from diffusers import ModelMixin
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ import torch.cuda.amp as amp
10
+ import torch.distributed as dist
11
+ from xfuser.core.distributed import (
12
+ get_sequence_parallel_rank,
13
+ get_sequence_parallel_world_size,
14
+ get_sp_group,
15
+ )
16
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
17
+ try:
18
+ import flash_attn_interface
19
+ FLASH_ATTN_3_AVAILABLE = True
20
+ except ModuleNotFoundError:
21
+ FLASH_ATTN_3_AVAILABLE = False
22
+
23
+ try:
24
+ import flash_attn
25
+ FLASH_ATTN_2_AVAILABLE = True
26
+ except ModuleNotFoundError:
27
+ FLASH_ATTN_2_AVAILABLE = False
28
+
29
+ try:
30
+ from sageattention import sageattn
31
+ SAGE_ATTN_AVAILABLE = True
32
+ except ModuleNotFoundError:
33
+ SAGE_ATTN_AVAILABLE = False
34
+
35
+
36
+ def flash_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, num_heads: int, compatibility_mode=False):
37
+ if compatibility_mode:
38
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
39
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
40
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
41
+ x = F.scaled_dot_product_attention(q, k, v)
42
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
43
+ elif SAGE_ATTN_AVAILABLE:
44
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
45
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
46
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
47
+ x = sageattn(q, k, v)
48
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
49
+ elif FLASH_ATTN_3_AVAILABLE:
50
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
51
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
52
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
53
+ x = flash_attn_interface.flash_attn_func(q, k, v)
54
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
55
+ elif FLASH_ATTN_2_AVAILABLE:
56
+ q = rearrange(q, "b s (n d) -> b s n d", n=num_heads)
57
+ k = rearrange(k, "b s (n d) -> b s n d", n=num_heads)
58
+ v = rearrange(v, "b s (n d) -> b s n d", n=num_heads)
59
+ x = flash_attn.flash_attn_func(q, k, v)
60
+ x = rearrange(x, "b s n d -> b s (n d)", n=num_heads)
61
+ else:
62
+ q = rearrange(q, "b s (n d) -> b n s d", n=num_heads)
63
+ k = rearrange(k, "b s (n d) -> b n s d", n=num_heads)
64
+ v = rearrange(v, "b s (n d) -> b n s d", n=num_heads)
65
+ x = F.scaled_dot_product_attention(q, k, v)
66
+ x = rearrange(x, "b n s d -> b s (n d)", n=num_heads)
67
+ return x
68
+
69
+ def sinusoidal_embedding_1d(dim, position):
70
+ sinusoid = torch.outer(position.type(torch.float64), torch.pow(
71
+ 10000, -torch.arange(dim//2, dtype=torch.float64, device=position.device).div(dim//2)))
72
+ x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
73
+ return x.to(position.dtype)
74
+
75
+
76
+ def precompute_freqs_cis_3d(dim: int, end: int = 1024, theta: float = 10000.0):
77
+ # 3d rope precompute
78
+ f_freqs_cis = precompute_freqs_cis(dim - 2 * (dim // 3), end, theta)
79
+ h_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
80
+ w_freqs_cis = precompute_freqs_cis(dim // 3, end, theta)
81
+ return torch.cat([f_freqs_cis, h_freqs_cis, w_freqs_cis], dim=1)
82
+
83
+
84
+ def precompute_freqs_cis(dim: int, end: int = 1024, theta: float = 10000.0):
85
+ # 1d rope precompute
86
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)
87
+ [: (dim // 2)].double() / dim))
88
+ freqs = torch.outer(torch.arange(end, device=freqs.device), freqs)
89
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
90
+ return freqs_cis
91
+
92
+ def pad_freqs(original_tensor, target_len):
93
+ seq_len, s1, s2 = original_tensor.shape
94
+ pad_size = target_len - seq_len
95
+ padding_tensor = torch.ones(
96
+ pad_size,
97
+ s1,
98
+ s2,
99
+ dtype=original_tensor.dtype,
100
+ device=original_tensor.device)
101
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
102
+ return padded_tensor
103
+
104
+ def rope_apply(x, freqs, grid_sizes, use_usp=False, sp_size=1, sp_rank=0):
105
+ """
106
+ x: [B, L, N, C].
107
+ grid_sizes: [B, 3].
108
+ freqs: [M, C // 2].
109
+ """
110
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
111
+ # split freqs
112
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) # [[N, head_dim/2], [N, head_dim/2], [N, head_dim/2]] # polar-form frequencies for T, H, W
113
+
114
+ # loop over samples
115
+
116
+ (f, h, w) = grid_sizes
117
+ seq_len = f * h * w
118
+
119
+ # precompute multipliers
120
+ x_i = torch.view_as_complex(x[0, :s].to(torch.float64).reshape(
121
+ s, n, -1, 2)) # [L, N, C/2] # as complex (polar) values
122
+ freqs_i = torch.cat([
123
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
124
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
125
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
126
+ ],
127
+ dim=-1).reshape(seq_len, 1, -1) # seq_lens, 1, 3 * dim / 2 (T H W)
128
+
129
+ if use_usp:
130
+ # apply rotary embedding
131
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
132
+ s_per_rank = s
133
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
134
+ s_per_rank), :, :]
135
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
136
+ x_i = torch.cat([x_i, x[0, s:]])
137
+ else:
138
+ x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
139
+ x_i = torch.cat([x_i, x[0, seq_len:]])
140
+ return x_i.unsqueeze(0).to(x.dtype)
141
+
142
+
143
+ class RMSNorm(nn.Module):
144
+ def __init__(self, dim, eps=1e-5):
145
+ super().__init__()
146
+ self.eps = eps
147
+ self.weight = nn.Parameter(torch.ones(dim))
148
+
149
+ def norm(self, x):
150
+ return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
151
+
152
+ def forward(self, x):
153
+ dtype = x.dtype
154
+ return self.norm(x.float()).to(dtype) * self.weight
155
+
156
+ class SelfAttention(nn.Module):
157
+ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
158
+ super().__init__()
159
+ self.dim = dim
160
+ self.num_heads = num_heads
161
+ self.head_dim = dim // num_heads
162
+
163
+ self.q = nn.Linear(dim, dim)
164
+ self.k = nn.Linear(dim, dim)
165
+ self.v = nn.Linear(dim, dim)
166
+ self.o = nn.Linear(dim, dim)
167
+ self.norm_q = RMSNorm(dim, eps=eps)
168
+ self.norm_k = RMSNorm(dim, eps=eps)
169
+
170
+ self.use_usp = dist.is_initialized()
171
+ self.sp_size = get_sequence_parallel_world_size() if self.use_usp else 1
172
+ self.sp_rank = get_sequence_parallel_rank() if self.use_usp else 0
173
+
174
+ def forward(self, x, freqs, grid_sizes):
175
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
176
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
177
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
178
+ v = self.v(x)
179
+
180
+ if self.use_usp:
181
+ from yunchang.kernels import AttnType
182
+ if SAGE_ATTN_AVAILABLE:
183
+ attn_type = AttnType.SAGE_AUTO
184
+ else:
185
+ attn_type = AttnType.FA
186
+
187
+ x = xFuserLongContextAttention(attn_type=attn_type)(
188
+ None,
189
+ query=rope_apply(q, freqs, grid_sizes, self.use_usp, self.sp_size, self.sp_rank),
190
+ key=rope_apply(k, freqs, grid_sizes, self.use_usp, self.sp_size, self.sp_rank),
191
+ value=v.view(b, s, n, d),
192
+ ).flatten(2)
193
+ else:
194
+ x = flash_attention(
195
+ q=rope_apply(q, freqs, grid_sizes).flatten(2),
196
+ k=rope_apply(k, freqs, grid_sizes).flatten(2),
197
+ v=v,
198
+ num_heads=self.num_heads
199
+ )
200
+ return self.o(x)
201
+
202
+
203
+ class CrossAttention(nn.Module):
204
+ def __init__(self, dim: int, num_heads: int, eps: float = 1e-6, has_image_input: bool = False):
205
+ super().__init__()
206
+ self.dim = dim
207
+ self.num_heads = num_heads
208
+ self.head_dim = dim // num_heads
209
+
210
+ self.q = nn.Linear(dim, dim)
211
+ self.k = nn.Linear(dim, dim)
212
+ self.v = nn.Linear(dim, dim)
213
+ self.o = nn.Linear(dim, dim)
214
+ self.norm_q = RMSNorm(dim, eps=eps)
215
+ self.norm_k = RMSNorm(dim, eps=eps)
216
+ self.has_image_input = has_image_input
217
+ if has_image_input:
218
+ self.k_img = nn.Linear(dim, dim)
219
+ self.v_img = nn.Linear(dim, dim)
220
+ self.norm_k_img = RMSNorm(dim, eps=eps)
221
+
222
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
223
+ if self.has_image_input:
224
+ img = y[:, :257]
225
+ ctx = y[:, 257:]
226
+ else:
227
+ ctx = y
228
+ q = self.norm_q(self.q(x))
229
+ k = self.norm_k(self.k(ctx))
230
+ v = self.v(ctx)
231
+ x = flash_attention(q, k, v, num_heads=self.num_heads)
232
+ if self.has_image_input:
233
+ k_img = self.norm_k_img(self.k_img(img))
234
+ v_img = self.v_img(img)
235
+ y = flash_attention(q, k_img, v_img, num_heads=self.num_heads)
236
+ x = x + y
237
+ return self.o(x)
238
+
239
+ class DiTAudioBlock(nn.Module):
240
+ def __init__(self, has_image_input: bool, dim: int, num_heads: int, ffn_dim: int, eps: float = 1e-6, i=0, num_layers=0):
241
+ super().__init__()
242
+ self.dim = dim
243
+ self.num_heads = num_heads
244
+ self.ffn_dim = ffn_dim
245
+ self.i = i
246
+ self.num_layers = num_layers
247
+
248
+ self.self_attn = SelfAttention(dim, num_heads, eps)
249
+ self.cross_attn = CrossAttention(
250
+ dim, num_heads, eps, has_image_input=has_image_input)
251
+ self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
252
+ self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
253
+ self.norm3 = nn.LayerNorm(dim, eps=eps)
254
+ self.ffn = nn.Sequential(nn.Linear(dim, ffn_dim), nn.GELU(
255
+ approximate='tanh'), nn.Linear(ffn_dim, dim))
256
+ self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
257
+
258
+ self.use_usp = dist.is_initialized()
259
+ self.sp_size = get_sequence_parallel_world_size() if self.use_usp else 1
260
+ self.sp_rank = get_sequence_parallel_rank() if self.use_usp else 0
261
+
262
+ def forward(self, x, context, t_mod, freqs, grid_sizes):
263
+ e = (self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod).chunk(6, dim=1)
264
+
265
+ y = self.self_attn(
266
+ self.norm1(x) * (1 + e[1]) + e[0], freqs, grid_sizes)
267
+
268
+ x = x + y * e[2]
269
+
270
+ x_1 = rearrange(self.norm3(x), 'b (f l) c -> (b f) l c', f=context.shape[1])
271
+ context_1 = context.squeeze(0)
272
+
273
+ if self.use_usp:
274
+ context_1 = context_1.unsqueeze(1).repeat(1, self.sp_size, 1, 1).flatten(0,1)
275
+ context_1 = torch.chunk(context_1, self.sp_size, dim=0)[self.sp_rank]
276
+
277
+ x = x + self.cross_attn(x_1, context_1).flatten(0, 1).unsqueeze(0)
278
+
279
+ y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
280
+ x = x + y * e[5]
281
+
282
+ return x
283
+
284
+ class MLP(torch.nn.Module):
285
+ def __init__(self, in_dim, out_dim):
286
+ super().__init__()
287
+ self.proj = torch.nn.Sequential(
288
+ nn.LayerNorm(in_dim),
289
+ nn.Linear(in_dim, in_dim),
290
+ nn.GELU(),
291
+ nn.Linear(in_dim, out_dim),
292
+ nn.LayerNorm(out_dim)
293
+ )
294
+
295
+ def forward(self, x):
296
+ return self.proj(x)
297
+
298
+
299
+ class Head(nn.Module):
300
+ def __init__(self, dim: int, out_dim: int, patch_size: Tuple[int, int, int], eps: float):
301
+ super().__init__()
302
+ self.dim = dim
303
+ self.patch_size = patch_size
304
+ self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
305
+ self.head = nn.Linear(dim, out_dim * math.prod(patch_size))
306
+ self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
307
+
308
+ def forward(self, x, t_mod):
309
+ r"""
310
+ Args:
311
+ x(Tensor): Shape [B, L1, C]
312
+ t_mod(Tensor): Shape [B*21, C]
313
+ """
314
+ B, L, D = x.shape
315
+ F = t_mod.shape[0] // B
316
+ shift, scale = (self.modulation.to(dtype=t_mod.dtype, device=t_mod.device).unsqueeze(1) + t_mod.unflatten(dim=0, sizes=(B, t_mod.shape[0]//B)).unsqueeze(2)).chunk(2, dim=2)
317
+
318
+ x = rearrange(x, 'b (f l) d -> b f l d', f=F)
319
+ x = (self.head(self.norm(x) * (1 + scale) + shift))
320
+ x = rearrange(x, 'b f l d -> b (f l) d')
321
+ return x
322
+
323
+ class WanModelAudioProject(ModelMixin, ConfigMixin):
324
+ _no_split_modules = ['DiTAudioBlock']
325
+ @register_to_config
326
+ def __init__(
327
+ self,
328
+ dim: int,
329
+ in_dim: int,
330
+ ffn_dim: int,
331
+ out_dim: int,
332
+ text_dim: int,
333
+ freq_dim: int,
334
+ eps: float,
335
+ vae_stride: Tuple[int, int, int],
336
+ patch_size: Tuple[int, int, int],
337
+ num_heads: int,
338
+ num_layers: int,
339
+ has_image_input: bool,
340
+ **kwargs,
341
+ ):
342
+ super().__init__()
343
+ self.dim = dim
344
+ self.freq_dim = freq_dim
345
+ self.has_image_input = has_image_input
346
+ self.patch_size = patch_size
347
+
348
+ self.patch_embedding = nn.Conv3d(
349
+ in_dim, dim, kernel_size=patch_size, stride=patch_size)
350
+ self.text_embedding = nn.Sequential(
351
+ nn.Linear(text_dim, dim),
352
+ nn.GELU(approximate='tanh'),
353
+ nn.Linear(dim, dim)
354
+ )
355
+ self.time_embedding = nn.Sequential(
356
+ nn.Linear(freq_dim, dim),
357
+ nn.SiLU(),
358
+ nn.Linear(dim, dim)
359
+ )
360
+ self.time_projection = nn.Sequential(
361
+ nn.SiLU(), nn.Linear(dim, dim * 6))
362
+ self.blocks = nn.ModuleList([
363
+ DiTAudioBlock(has_image_input, dim, num_heads, ffn_dim, eps, i, num_layers)
364
+ for i in range(num_layers)
365
+ ])
366
+ self.head = Head(dim, out_dim, patch_size, eps)
367
+ head_dim = dim // num_heads
368
+ self.freqs = precompute_freqs_cis_3d(head_dim)
369
+
370
+ self.audio_emb = MLP(768, dim)
371
+
372
+ if has_image_input:
373
+ self.img_emb = MLP(1280, dim)
374
+
375
+ # init audio adapter
376
+ audio_window = 5
377
+ vae_scale = vae_stride[0]
378
+ intermediate_dim = 512
379
+ output_dim = 1536
380
+ context_tokens = 32
381
+ norm_output_audio = True
382
+ self.audio_window = audio_window
383
+ self.vae_scale = vae_scale
384
+ self.audio_proj = AudioProjModel(
385
+ seq_len=audio_window,
386
+ seq_len_vf=audio_window+vae_scale-1,
387
+ intermediate_dim=intermediate_dim,
388
+ output_dim=output_dim,
389
+ context_tokens=context_tokens,
390
+ norm_output_audio=norm_output_audio,
391
+ )
392
+
393
+ self.use_usp = dist.is_initialized()
394
+ self.sp_size = get_sequence_parallel_world_size() if self.use_usp else 1
395
+ self.sp_rank = get_sequence_parallel_rank() if self.use_usp else 0
396
+
397
+ def patchify(self, x: torch.Tensor):
398
+ x = self.patch_embedding(x)
399
+ grid_size = x.shape[2:]
400
+ x = rearrange(x, 'b c f h w -> b (f h w) c').contiguous()
401
+ return x, grid_size # x, grid_size: (f, h, w)
402
+
403
+ def unpatchify(self, x: torch.Tensor, grid_size: torch.Tensor):
404
+ return rearrange(
405
+ x, 'b (f h w) (x y z c) -> b c (f x) (h y) (w z)',
406
+ f=grid_size[0], h=grid_size[1], w=grid_size[2],
407
+ x=self.patch_size[0], y=self.patch_size[1], z=self.patch_size[2]
408
+ )
409
+
410
+ def forward(self,
411
+ x: torch.Tensor, #(1, 16, 9, 64, 64))
412
+ timestep: torch.Tensor, #(9,)
413
+ context: torch.Tensor, #(5, 33, 12, 768)
414
+ y: Optional[torch.Tensor] = None, #(1, 16, 9, 64, 64)
415
+ use_gradient_checkpointing: bool = False,
416
+ use_gradient_checkpointing_offload: bool = False,
417
+ **kwargs,
418
+ ):
419
+
420
+ if self.freqs.device != x.device:
421
+ self.freqs = self.freqs.to(x.device)
422
+
423
+ x = torch.cat([x, y], dim=1) # (1, 32, 9, 64, 64)
424
+ x, grid_sizes = self.patchify(x)
425
+ t = self.time_embedding(
426
+ sinusoidal_embedding_1d(self.freq_dim, timestep.to(dtype=x.dtype)))
427
+ t_mod = self.time_projection(t).unflatten(1, (6, self.dim)) # (bsz, 6, 1536)
428
+
429
+ # ==================== Audio condition processing ====================
430
+ # Input: context (bsz, 81, 5, 12, 768)
431
+ # - 81 frames = 1 (first frame) + 80 (subsequent frames; every 4 frames map to one VAE-compressed latent frame)
432
+ # - 5 is the audio window size (audio_window)
433
+ # - 12 is the number of audio feature blocks
434
+ # - 768 is the audio feature dimension
435
+
436
+ audio_cond = context.to(device=x.device, dtype=x.dtype)
437
+
438
+ # 1. First frame: use the full 5-frame audio window directly
439
+ first_frame_audio = audio_cond[:, :1, ...] # (bsz, 1, 5, 12, 768)
440
+
441
+ # 2. Subsequent frames: pick a different audio window depending on the frame's position
442
+ # Rearrange the 32 frames into (8 VAE latents, 4 frames each)
443
+ latter_frames_audio = rearrange(
444
+ audio_cond[:, 1:, ...],
445
+ "b (n_latent n_frame) w s c -> b n_latent n_frame w s c",
446
+ n_frame=self.vae_scale # vae_scale=4
447
+ ) # (bsz, 8, 4, 5, 12, 768)
448
+
449
+ mid_idx = self.audio_window // 2 # window center index: 5 // 2 = 2
450
+
451
+ # Pick the appropriate audio window for each of the 4 frames in a latent group:
452
+ # - 1st frame (frame index 0): no past, take the leading 3-frame window [:mid_idx+1] = [:3]
453
+ # - middle frames (frame indices 1-2): take the single center frame [mid_idx:mid_idx+1] = [2:3]
454
+ # - 4th frame (frame index 3): no future, take the trailing 3-frame window [mid_idx:] = [2:]
455
+
456
+ first_of_group = latter_frames_audio[:, :, :1, :mid_idx+1, ...] # (bsz, 8, 1, 3, 12, 768)
457
+ middle_of_group = latter_frames_audio[:, :, 1:-1, mid_idx:mid_idx+1, ...] # (bsz, 8, 2, 1, 12, 768)
458
+ last_of_group = latter_frames_audio[:, :, -1:, mid_idx:, ...] # (bsz, 8, 1, 3, 12, 768)
459
+
460
+ # Concatenate and flatten the window dims: (n_frame, window) -> (n_frame * window)
461
+ latter_frames_audio_processed = torch.cat([
462
+ rearrange(first_of_group, "b n_latent n_f w s c -> b n_latent (n_f w) s c"),
463
+ rearrange(middle_of_group, "b n_latent n_f w s c -> b n_latent (n_f w) s c"),
464
+ rearrange(last_of_group, "b n_latent n_f w s c -> b n_latent (n_f w) s c"),
465
+ ], dim=2) # (bsz, 8, 1*3 + 2*1 + 1*3, 12, 768) = (bsz, 8, 8, 12, 768)
466
+
467
+ # 3. Project into the feature space the DiT expects via AudioProjModel
468
+ context = self.audio_proj(
469
+ first_frame_audio,
470
+ latter_frames_audio_processed
471
+ ).to(x.dtype) # (bsz, 9, 32, 1536)
472
+
473
+ if self.use_usp:
474
+ x = torch.chunk(x, self.sp_size, dim=1)[self.sp_rank]
475
+
476
+ for block in self.blocks:
477
+ x = block(x, context, t_mod, self.freqs, grid_sizes)
478
+ x = self.head(x, t) # (bsz, 9*32*32, 64)
479
+ if self.use_usp:
480
+ x = get_sp_group().all_gather(x, dim=1)
481
+ x = self.unpatchify(x, grid_sizes) # (bsz, 16, 21, 64, 64)
482
+ return x
483
+
484
+
485
+ class AudioProjModel(ModelMixin, ConfigMixin):
486
+ def __init__(
487
+ self,
488
+ seq_len=5,
489
+ seq_len_vf=12,
490
+ blocks=12,
491
+ channels=768,
492
+ intermediate_dim=512,
493
+ output_dim=768,
494
+ context_tokens=32,
495
+ norm_output_audio=False,
496
+ ):
497
+ super().__init__()
498
+
499
+ self.seq_len = seq_len
500
+ self.blocks = blocks
501
+ self.channels = channels
502
+ self.input_dim = seq_len * blocks * channels
503
+ self.input_dim_vf = seq_len_vf * blocks * channels
504
+ self.intermediate_dim = intermediate_dim
505
+ self.context_tokens = context_tokens
506
+ self.output_dim = output_dim
507
+
508
+ # define multiple linear layers
509
+ self.proj1 = nn.Linear(self.input_dim, intermediate_dim)
510
+ self.proj1_vf = nn.Linear(self.input_dim_vf, intermediate_dim)
511
+ self.proj2 = nn.Linear(intermediate_dim, intermediate_dim)
512
+ self.proj3 = nn.Linear(intermediate_dim, context_tokens * output_dim)
513
+ self.norm = nn.LayerNorm(output_dim) if norm_output_audio else nn.Identity()
514
+
515
+ def forward(self, audio_embeds, audio_embeds_vf):
516
+ video_length = audio_embeds.shape[1] + audio_embeds_vf.shape[1]
517
+ B, _, _, S, C = audio_embeds.shape
518
+
519
+ # process audio of first frame
520
+ audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c")
521
+ batch_size, window_size, blocks, channels = audio_embeds.shape
522
+ audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels)
523
+
524
+ # process audio of latter frame
525
+ audio_embeds_vf = rearrange(audio_embeds_vf, "bz f w b c -> (bz f) w b c")
526
+ batch_size_vf, window_size_vf, blocks_vf, channels_vf = audio_embeds_vf.shape
527
+ audio_embeds_vf = audio_embeds_vf.view(batch_size_vf, window_size_vf * blocks_vf * channels_vf)
528
+
529
+ # first projection
530
+ audio_embeds = torch.relu(self.proj1(audio_embeds))
531
+ audio_embeds_vf = torch.relu(self.proj1_vf(audio_embeds_vf))
532
+ audio_embeds = rearrange(audio_embeds, "(bz f) c -> bz f c", bz=B)
533
+ audio_embeds_vf = rearrange(audio_embeds_vf, "(bz f) c -> bz f c", bz=B)
534
+ audio_embeds_c = torch.concat([audio_embeds, audio_embeds_vf], dim=1)
535
+ batch_size_c, N_t, C_a = audio_embeds_c.shape
536
+ audio_embeds_c = audio_embeds_c.view(batch_size_c*N_t, C_a)
537
+
538
+ # second projection
539
+ audio_embeds_c = torch.relu(self.proj2(audio_embeds_c))
540
+
541
+ context_tokens = self.proj3(audio_embeds_c).reshape(batch_size_c*N_t, self.context_tokens, self.output_dim)
542
+
543
+ # normalization and reshape
544
+ with amp.autocast(dtype=torch.float32):
545
+ context_tokens = self.norm(context_tokens)
546
+ context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length)
547
+
548
+ return context_tokens
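A shape-only sketch of the per-latent audio-window selection done in `WanModelAudioProject.forward` above, using dummy tensors (sizes follow the comments in the code):

```python
import torch
from einops import rearrange

bsz, n_latent, vae_scale, window, blocks, dim = 1, 8, 4, 5, 12, 768
mid = window // 2  # 2

latter = torch.zeros(bsz, n_latent, vae_scale, window, blocks, dim)
first  = latter[:, :, :1, :mid + 1]       # (1, 8, 1, 3, 12, 768): leading window for the 1st frame
middle = latter[:, :, 1:-1, mid:mid + 1]  # (1, 8, 2, 1, 12, 768): center frame for the middle frames
last   = latter[:, :, -1:, mid:]          # (1, 8, 1, 3, 12, 768): trailing window for the 4th frame

merged = torch.cat([
    rearrange(first,  "b n f w s c -> b n (f w) s c"),
    rearrange(middle, "b n f w s c -> b n (f w) s c"),
    rearrange(last,   "b n f w s c -> b n (f w) s c"),
], dim=2)
print(merged.shape)  # torch.Size([1, 8, 8, 12, 768]) -> 3 + 2*1 + 3 = 8 audio slices per latent
```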
flash_head/src/pipeline/flash_head_pipeline.py ADDED
@@ -0,0 +1,316 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import os
3
+ from PIL import Image
4
+ from loguru import logger
5
+ import time
6
+ import numpy as np
7
+ import torch
8
+ import torch.distributed as dist
9
+ from einops import rearrange
10
+
11
+ from transformers import Wav2Vec2FeatureExtractor
12
+
13
+ from flash_head.src.modules.flash_head_model import WanModelAudioProject
14
+ from flash_head.audio_analysis.wav2vec2 import Wav2Vec2Model
15
+ from flash_head.utils.utils import match_and_blend_colors_torch, resize_and_centercrop
16
+ from flash_head.utils.facecrop import process_image
17
+
18
+ # compile models to speed up inference
19
+ COMPILE_MODEL = False
20
+ COMPILE_VAE = False
21
+ # use a parallel VAE to speed up decode/encode; only supported for WanVAE
22
+ USE_PARALLEL_VAE = True
23
+
24
+ def get_cond_image_dict(cond_image_path_or_dir, use_face_crop):
25
+ def get_image(cond_image_path, use_face_crop):
26
+ if use_face_crop:
27
+ try:
28
+ image = process_image(cond_image_path)
29
+ return image
30
+ except Exception as e:
31
+ logger.error(f"Error processing {cond_image_path}: {e}")
32
+ return Image.open(cond_image_path).convert("RGB")
33
+
34
+ if os.path.isdir(cond_image_path_or_dir):
35
+ import glob
36
+ cond_image_list = glob.glob(os.path.join(cond_image_path_or_dir, "*.png"))
37
+ cond_image_list.sort()
38
+ cond_image_dict = {cond_image.split("/")[-1].split(".")[0]: get_image(cond_image, use_face_crop) for cond_image in cond_image_list}
39
+ else:
40
+ cond_image_dict = {cond_image_path_or_dir.split("/")[-1].split(".")[0]: get_image(cond_image_path_or_dir, use_face_crop)}
41
+ return cond_image_dict
42
+
43
+ def timestep_transform(
44
+ t,
45
+ shift=5.0,
46
+ num_timesteps=1000,
47
+ ):
48
+ t = t / num_timesteps
49
+ # shift the timestep based on ratio
50
+ new_t = shift * t / (1 + (shift - 1) * t)
51
+ new_t = new_t * num_timesteps
52
+ return new_t
53
+
54
+
55
+ class FlashHeadPipeline:
56
+ def __init__(
57
+ self,
58
+ checkpoint_dir,
59
+ model_type,
60
+ wav2vec_dir,
61
+ device="cuda",
62
+ param_dtype=torch.bfloat16,
63
+ use_usp=False,
64
+ num_timesteps=1000,
65
+ use_timestep_transform=True,
66
+ ):
67
+ r"""
68
+ Initializes the image-to-video generation model components.
69
+ Args:
70
+ checkpoint_dir (`str`):
71
+ Path to directory containing model checkpoints
72
+ wav2vec_dir (`str`):
73
+ Path to directory containing wav2vec checkpoints
74
+ use_usp (`bool`, *optional*, defaults to False):
75
+ Enable distribution strategy of USP.
76
+ """
77
+ self.param_dtype = param_dtype
78
+ self.device = device
79
+ self.rank = dist.get_rank() if dist.is_initialized() else 0
80
+ self.use_usp = use_usp and dist.is_initialized()
81
+ self.model_type = model_type
82
+ self.use_ltx = model_type == "lite"
83
+
84
+ if self.use_ltx:
85
+ model_dir = os.path.join(checkpoint_dir, "Model_Lite")
86
+ vae_dir = os.path.join(checkpoint_dir, "VAE_LTX")
87
+
88
+ from flash_head.ltx_video.ltx_vae import LtxVAE
89
+ self.vae = LtxVAE(
90
+ pretrained_model_type_or_path=vae_dir,
91
+ dtype=self.param_dtype,
92
+ device=self.device,
93
+ )
94
+ else:
95
+ vae_path = os.path.join(checkpoint_dir, "VAE_Wan/Wan2.1_VAE.pth")
96
+
97
+ from flash_head.wan.modules import WanVAE
98
+ self.vae = WanVAE(
99
+ vae_path=vae_path,
100
+ dtype=self.param_dtype,
101
+ device=self.device,
102
+ parallel=(USE_PARALLEL_VAE and self.use_usp),
103
+ )
104
+
105
+ if self.model_type == "pretrained":
106
+ self.audio_guide_scale = 3.0
107
+ model_dir = os.path.join(checkpoint_dir, "teacher")
108
+ elif self.model_type == "pro":
109
+ model_dir = os.path.join(checkpoint_dir, "Model_Pro")
110
+
111
+ self.model = WanModelAudioProject.from_pretrained(model_dir)
112
+ self.model.eval().requires_grad_(False)
113
+ self.model.to(device=self.device, dtype=self.param_dtype)
114
+
115
+ self.config = self.model.config
116
+
117
+ if use_usp:
118
+ from xfuser.core.distributed import get_sequence_parallel_world_size
119
+ self.sp_size = get_sequence_parallel_world_size()
120
+ else:
121
+ self.sp_size = 1
122
+
123
+ if dist.is_initialized():
124
+ dist.barrier()
125
+
126
+ self.num_timesteps = num_timesteps
127
+ self.use_timestep_transform = use_timestep_transform
128
+
129
+ if COMPILE_MODEL:
130
+ self.model = torch.compile(self.model)
131
+ if COMPILE_VAE:
132
+ if self.use_ltx:
133
+ self.vae.model.encode = torch.compile(self.vae.model.encode)
134
+ self.vae.model.decode = torch.compile(self.vae.model.decode)
135
+ else:
136
+ self.vae.encode = torch.compile(self.vae.encode)
137
+ self.vae.decode = torch.compile(self.vae.decode)
138
+
139
+ self.audio_encoder = Wav2Vec2Model.from_pretrained(wav2vec_dir, local_files_only=True).to(self.device)
140
+ self.audio_encoder.feature_extractor._freeze_parameters()
141
+ self.wav2vec_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(wav2vec_dir, local_files_only=True)
142
+
143
+ @torch.no_grad()
144
+ def prepare_params(self,
145
+ cond_image_path_or_dir,
146
+ target_size,
147
+ frame_num,
148
+ motion_frames_num,
149
+ sampling_steps,
150
+ seed=None,
151
+ shift=5.0,
152
+ color_correction_strength=0.0,
153
+ use_face_crop=False,
154
+ ):
155
+ self.cond_image_dict = get_cond_image_dict(cond_image_path_or_dir, use_face_crop)
156
+
157
+ self.frame_num = frame_num
158
+ self.motion_frames_num = motion_frames_num
159
+ self.color_correction_strength = color_correction_strength
160
+
161
+ self.target_h, self.target_w = target_size
162
+ self.lat_h, self.lat_w = self.target_h // self.config.vae_stride[1], self.target_w // self.config.vae_stride[2]
163
+
164
+ self.generator = torch.Generator(device=self.device).manual_seed(seed)
165
+
166
+ # prepare timesteps
167
+ if sampling_steps == 2:
168
+ timesteps = [1000, 500]
169
+ elif sampling_steps == 4:
170
+ timesteps = [1000, 750, 500, 250]
171
+ else:
172
+ timesteps = list(np.linspace(self.num_timesteps, 1, sampling_steps, dtype=np.float32))
173
+
174
+ timesteps.append(0.)
175
+ timesteps = [torch.tensor([t], device=self.device) for t in timesteps]
176
+ if self.use_timestep_transform:
177
+ timesteps = [timestep_transform(t, shift=shift, num_timesteps=self.num_timesteps) for t in timesteps]
178
+ self.timesteps = timesteps
179
+
180
+ self.cond_image_tensor_dict = {}
181
+ self.ref_img_latent_dict = {}
182
+ for i, (person_name, cond_image_pil) in enumerate(self.cond_image_dict.items()):
183
+ cond_image_tensor = resize_and_centercrop(cond_image_pil, (self.target_h, self.target_w)).to(self.device, dtype=self.param_dtype) # 1 C 1 H W
184
+ cond_image_tensor = (cond_image_tensor / 255 - 0.5) * 2
185
+
186
+ self.cond_image_tensor_dict[person_name] = cond_image_tensor
187
+
188
+ video_frames = cond_image_tensor.repeat(1, 1, self.frame_num, 1, 1)
189
+ self.ref_img_latent_dict[person_name] = self.vae.encode(video_frames) # (16, 9, 64, 64) / (128, 5, 16, 16)
190
+ if i == 0:
191
+ self.reset_person_name(person_name)
192
+
193
+ return
194
+
195
+ @torch.no_grad()
196
+ def reset_person_name(self, person_name=None):
197
+ if person_name is None or person_name not in self.cond_image_dict:
198
+ pass
199
+ else:
200
+ self.person_name = person_name
201
+ self.original_color_reference = self.cond_image_tensor_dict[self.person_name]
202
+ self.ref_img_latent = self.ref_img_latent_dict[self.person_name]
203
+ self.latent_motion_frames = self.ref_img_latent[:, :1].clone()
204
+
205
+ @torch.no_grad()
206
+ def preprocess_audio(self, speech_array, sr=16000, fps=25):
207
+ video_length = len(speech_array) * fps / sr
208
+
209
+ # wav2vec_feature_extractor
210
+ audio_feature = np.squeeze(
211
+ self.wav2vec_feature_extractor(speech_array, sampling_rate=sr).input_values
212
+ )
213
+ audio_feature = torch.from_numpy(audio_feature).float().to(device=self.device)
214
+ audio_feature = audio_feature.unsqueeze(0)
215
+
216
+ # audio encoder
217
+ with torch.no_grad():
218
+ embeddings = self.audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
219
+
220
+ if len(embeddings) == 0:
221
+ logger.error("Fail to extract audio embedding")
222
+ return None
223
+
224
+ audio_emb = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
225
+ audio_emb = rearrange(audio_emb, "b s d -> s b d")
226
+ return audio_emb
227
+
228
+ @torch.no_grad()
229
+ def generate(self, audio_embedding):
230
+ # evaluation mode
231
+ with torch.no_grad():
232
+
233
+ # sample videos
234
+ noise = torch.randn(
235
+ self.config.out_dim,
236
+ (self.frame_num - 1) // self.config.vae_stride[0] + 1,
237
+ self.lat_h,
238
+ self.lat_w,
239
+ dtype=self.param_dtype,
240
+ device=self.device,
241
+ generator=self.generator)
242
+
243
+ for i in range(len(self.timesteps)-1):
244
+ torch.cuda.synchronize()
245
+ start_time = time.time()
246
+
247
+ noise[:, :self.latent_motion_frames.shape[1]] = self.latent_motion_frames
248
+
249
+ flow_pred = self.model(
250
+ x=noise.unsqueeze(0),
251
+ timestep=self.timesteps[i],
252
+ context=audio_embedding,
253
+ y=self.ref_img_latent.unsqueeze(0),
254
+ )[0]
255
+
256
+ if self.model_type == "pretrained":
257
+ flow_pred_drop_audio = self.model(
258
+ x=noise.unsqueeze(0),
259
+ timestep=self.timesteps[i],
260
+ context=torch.zeros_like(audio_embedding),
261
+ y=self.ref_img_latent.unsqueeze(0),
262
+ )[0]
263
+ flow_pred = flow_pred_drop_audio + self.audio_guide_scale * (flow_pred - flow_pred_drop_audio)
264
+
265
+ # update latent
266
+ dt = self.timesteps[i] - self.timesteps[i + 1]
267
+ dt = (dt / self.num_timesteps).to(self.param_dtype)
268
+ noise = noise - flow_pred * dt[:, None, None, None]
269
+
270
+ else:
271
+ # update latent
272
+ t_i = (self.timesteps[i][:, None, None, None] / self.num_timesteps).to(self.param_dtype)
273
+ t_i_1 = (self.timesteps[i+1][:, None, None, None] / self.num_timesteps).to(self.param_dtype)
274
+ x_0 = noise - flow_pred * t_i
275
+
276
+ noise = (1 - t_i_1) * x_0 + t_i_1 * torch.randn(x_0.size(), dtype=x_0.dtype, device=self.device, generator=self.generator)
277
+
278
+ torch.cuda.synchronize()
279
+ end_time = time.time()
280
+ if self.rank == 0:
281
+ print(f'[generate] model denoise per step: {end_time - start_time}s')
282
+
283
+ noise[:, :self.latent_motion_frames.shape[1]] = self.latent_motion_frames
284
+
285
+ torch.cuda.synchronize()
286
+ start_decode_time = time.time()
287
+
288
+ videos = self.vae.decode(noise)
289
+
290
+ torch.cuda.synchronize()
291
+ end_decode_time = time.time()
292
+ if self.rank == 0:
293
+ print(f'[generate] decode video frames: {end_decode_time - start_decode_time}s')
294
+
295
+ torch.cuda.synchronize()
296
+ start_color_correction_time = time.time()
297
+ if self.color_correction_strength > 0.0:
298
+ videos = match_and_blend_colors_torch(videos, self.original_color_reference, self.color_correction_strength)
299
+
300
+ cond_frame = videos[:, :, -self.motion_frames_num:].to(self.device)
301
+ torch.cuda.synchronize()
302
+ end_color_correction_time = time.time()
303
+ if self.rank == 0:
304
+ print(f'[generate] color correction: {end_color_correction_time - start_color_correction_time}s')
305
+
306
+ torch.cuda.synchronize()
307
+ start_encode_time = time.time()
308
+ self.latent_motion_frames = self.vae.encode(cond_frame)
309
+ torch.cuda.synchronize()
310
+ end_encode_time = time.time()
311
+ if self.rank == 0:
312
+ print(f'[generate] encode motion frames: {end_encode_time - start_encode_time}s')
313
+
314
+ gen_video_samples = videos #[:, :, self.motion_frames_num:]
315
+
316
+ return gen_video_samples[0].to(torch.float32)
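For intuition, `timestep_transform` above applies the SD3-style shift t' = shift * t / (1 + (shift - 1) * t) in the normalized [0, 1] range, which pushes sampled timesteps toward the high-noise end. A quick check with shift=5.0 (values rounded):

```python
import torch

for t in [1000.0, 750.0, 500.0, 250.0]:
    print(t, "->", round(timestep_transform(torch.tensor(t), shift=5.0).item(), 1))
# 1000.0 -> 1000.0, 750.0 -> 937.5, 500.0 -> 833.3, 250.0 -> 625.0
```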
flash_head/utils/cpu_face_handler.py ADDED
@@ -0,0 +1,55 @@
1
+ import mediapipe as mp
2
+ import numpy as np
3
+ from typing import Tuple, List
4
+
5
+
6
+ class CPUFaceHandler:
7
+ """Handler for CPU-based face detection using MediaPipe.
8
+ (2 ms/frame)
9
+ This handler provides a simple interface for face detection using MediaPipe's
10
+ face detection model. It's optimized for CPU usage and provides basic face
11
+ detection functionality.
12
+ """
13
+
14
+ def __init__(self, model_selection: int = 1, min_detection_confidence: float = 0.0):
15
+ """Initialize the face detection handler."""
16
+ self.detector = mp.solutions.face_detection.FaceDetection(
17
+ model_selection=model_selection,
18
+ min_detection_confidence=min_detection_confidence,
19
+ )
20
+
21
+ def detect(self, image: np.ndarray) -> Tuple[List[List[float]], List[float]]:
22
+ """Detect faces in the given image.
23
+
24
+ Args:
25
+ image (np.ndarray): RGB image array.
26
+
27
+ Returns:
28
+ Tuple[List[List[float]], List[float]]: A tuple containing:
29
+ - List of bounding boxes [x1, y1, x2, y2] in coordinates relative to the image size, one per detected face
30
+ - List of detection confidence scores, one per detected face
31
+ (both lists are empty when no face is detected)
32
+ """
33
+ bboxs, scores = [], []
34
+ results = self.detector.process(image)
35
+ detection_result = results.detections
36
+ if detection_result is None:
37
+ return bboxs, scores
38
+ for detection in detection_result:
39
+ bboxC = detection.location_data.relative_bounding_box
40
+ x, y, w, h = bboxC.xmin, bboxC.ymin, bboxC.width, bboxC.height
41
+ x1, y1, x2, y2 = x, y, x + w, y + h
42
+ bboxs.append([x1, y1, x2, y2])
43
+ scores.append(detection.score[0])
44
+ return bboxs, scores
45
+
46
+ def __call__(self, image: np.ndarray) -> Tuple[List[List[float]], List[float]]:
47
+ """Make the handler callable.
48
+
49
+ Args:
50
+ image (np.ndarray): RGB image array.
51
+
52
+ Returns:
53
+ Tuple[List[List[float]], List[float]]: Same as detect() method.
54
+ """
55
+ return self.detect(image)
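A usage sketch for `CPUFaceHandler`; MediaPipe returns boxes relative to the image size, so they are scaled by width and height before use (the input path is hypothetical):

```python
import numpy as np
from PIL import Image

detector = CPUFaceHandler()
rgb = np.array(Image.open("face.png").convert("RGB"))
boxes, scores = detector(rgb)
if boxes:
    h, w = rgb.shape[:2]
    x1, y1, x2, y2 = boxes[0]
    pixel_box = (x1 * w, y1 * h, x2 * w, y2 * h)  # relative -> absolute pixel coordinates
```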
flash_head/utils/facecrop.py ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Face cropping script
4
+ Detects the face in a single image, then crops and resizes it to the target size
5
+ """
6
+ import os
7
+ from PIL import Image
8
+ import numpy as np
9
+
10
+ from flash_head.utils.cpu_face_handler import CPUFaceHandler
11
+
12
+ def get_scaled_bbox(
13
+ bbox, img_w, img_h, ratio: float = 1.0, face_image: Image.Image = None
14
+ ):
15
+ """
16
+ Compute the scaled crop region from a face bounding box
17
+
18
+ Args:
19
+ bbox: face bounding box [x1, y1, x2, y2]
20
+ img_w: image width
21
+ img_h: image height
22
+ ratio: scale factor; larger values make the face occupy less of the frame (more margin around it)
23
+ face_image: PIL Image object
24
+
25
+ Returns:
26
+ The cropped face image
27
+ """
28
+ x1, y1, x2, y2 = bbox
29
+
30
+ # Calculate center point
31
+ center_x = (x1 + x2) / 2
32
+ center_y = (y1 + y2) / 2
33
+
34
+ # Calculate width and height
35
+ width = x2 - x1
36
+
37
+ # Scale width and height
38
+ new_width = width * ratio
39
+ new_height = new_width
40
+
41
+ # tile pix
42
+ dis_x_left = new_width * 0.5
43
+ dis_x_right = new_width - dis_x_left # 0.5new_width
44
+ dis_y_up = new_height * 0.55
45
+ dis_y_down = new_height - dis_y_up # 0.45new_height
46
+
47
+ # Calculate new coordinates
48
+ new_x1 = int(max(0, center_x - dis_x_left))
49
+ new_y1 = int(max(0, center_y - dis_y_up))
50
+ new_x2 = int(min(img_w, center_x + dis_x_right))
51
+ new_y2 = int(min(img_h, center_y + dis_y_down))
52
+ scaled_bbox = [new_x1, new_y1, new_x2, new_y2]
53
+ crop_face = face_image.crop(scaled_bbox)
54
+ return crop_face
55
+
56
+
57
+ def process_image(
58
+ input_path,
59
+ face_ratio=2.0,
60
+ target_size=(512, 512),
61
+ ):
62
+ """
63
+ Process a single image: detect the face and crop around it
64
+
65
+ Args:
66
+ input_path: path to the input image
67
+ face_ratio: face scale factor; recommended range 1.5-3.0, default 2.0
68
+ target_size: output image size, default (512, 512)
69
+
70
+ Returns:
71
+ image: the processed (cropped and resized) face image
72
+ """
73
+ # Initialize the face detector
74
+ face_detector = CPUFaceHandler()
75
+
76
+ # Validate the input file
77
+ if not os.path.isfile(input_path):
78
+ raise ValueError(f"File not found: {input_path}")
79
+
80
+ try:
81
+ # Load the image
82
+ image = Image.open(input_path)
83
+ image = image.convert("RGB")
84
+ image_rgb = np.array(image)
85
+ img_h, img_w = image_rgb.shape[:2]
86
+
87
+ # Detect faces
88
+ boxes, scores = face_detector(image_rgb)
89
+
90
+ if len(boxes) == 0:
91
+ raise ValueError("No face detected")
92
+
93
+ # Convert bounding-box coordinates from relative to absolute
94
+ boxes_abs = [
95
+ boxes[0][0] * img_w,
96
+ boxes[0][1] * img_h,
97
+ boxes[0][2] * img_w,
98
+ boxes[0][3] * img_h
99
+ ]
100
+
101
+ # Crop the face
102
+ crop_face = get_scaled_bbox(boxes_abs, img_w, img_h, face_ratio, image)
103
+
104
+ # Resize to the target size
105
+ crop_face = crop_face.resize(target_size)
106
+
107
+ return crop_face
108
+
109
+ except Exception as e:
110
+ raise ValueError(f"Error processing {input_path}: {e}")
flash_head/utils/utils.py ADDED
@@ -0,0 +1,222 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import math
5
+ from PIL import Image
6
+ import torchvision.transforms as transforms
7
+ import torch.nn as nn
8
+ import pyloudnorm as pyln
9
+
10
+ def rgb_to_lab_torch(rgb: torch.Tensor) -> torch.Tensor:
11
+ """
12
+ PyTorch GPU version: convert RGB to the Lab color space (input range [0, 1], arbitrary tensor shape, channels in the last dim).
13
+ Follows the standard CIE conversion formulas.
14
+ """
15
+ # Convert to linear RGB (inverse of the sRGB gamma correction)
16
+ linear_rgb = torch.where(
17
+ rgb > 0.04045,
18
+ ((rgb + 0.055) / 1.055) ** 2.4,
19
+ rgb / 12.92
20
+ )
21
+
22
+ # Linear RGB to XYZ (sRGB matrix, D65 white point)
23
+ xyz_from_rgb = torch.tensor([
24
+ [0.4124564, 0.3575761, 0.1804375],
25
+ [0.2126729, 0.7151522, 0.0721750],
26
+ [0.0193339, 0.1191920, 0.9503041]
27
+ ], dtype=rgb.dtype, device=rgb.device)
28
+
29
+ # Shape handling: flatten to (N, 3) for the matrix multiply, then restore the spatial dims
30
+ shape = linear_rgb.shape
31
+ linear_rgb_flat = linear_rgb.reshape(-1, 3) # (N, 3), N = B*T*H*W
32
+ xyz_flat = linear_rgb_flat @ xyz_from_rgb.T # (N, 3)
33
+ xyz = xyz_flat.reshape(shape) # restore the original shape
34
+
35
+ # XYZ to Lab (D65 reference white)
36
+ xyz_ref = torch.tensor([0.95047, 1.0, 1.08883], dtype=rgb.dtype, device=rgb.device)
37
+ xyz_normalized = xyz / xyz_ref[None, None, None, None, :] # broadcast over (B, T, H, W, C)
38
+
39
+ # Apply the Lab transfer function
40
+ epsilon = 0.008856
41
+ kappa = 903.3
42
+ xyz_normalized = torch.clamp(xyz_normalized, 1e-8, 1.0) # avoid numerical issues at zero
43
+
44
+ f_xyz = torch.where(
45
+ xyz_normalized > epsilon,
46
+ xyz_normalized ** (1/3),
47
+ (kappa * xyz_normalized + 16) / 116
48
+ )
49
+
50
+ L = 116 * f_xyz[..., 1] - 16 # Y channel -> lightness
51
+ a = 500 * (f_xyz[..., 0] - f_xyz[..., 1]) # X - Y -> red/green axis
52
+ b = 200 * (f_xyz[..., 1] - f_xyz[..., 2]) # Y - Z -> blue/yellow axis
53
+
54
+ lab = torch.stack([L, a, b], dim=-1) # stack the Lab channels in the last dim
55
+ return lab
56
+
57
+ def lab_to_rgb_torch(lab: torch.Tensor) -> torch.Tensor:
58
+ """
59
+ PyTorch GPU version: convert Lab back to RGB (output range [0, 1], arbitrary tensor shape, channels in the last dim).
60
+ """
61
+ # Split the Lab channels
62
+ L = lab[..., 0]
63
+ a = lab[..., 1]
64
+ b = lab[..., 2]
65
+
66
+ # Lab to XYZ
67
+ f_y = (L + 16) / 116
68
+ f_x = (a / 500) + f_y
69
+ f_z = f_y - (b / 200)
70
+
71
+ epsilon = 0.008856
72
+ kappa = 903.3
73
+
74
+ x = torch.where(f_x ** 3 > epsilon, f_x ** 3, (116 * f_x - 16) / kappa)
75
+ y = torch.where(L > kappa * epsilon, ((L + 16) / 116) ** 3, L / kappa)
76
+ z = torch.where(f_z ** 3 > epsilon, f_z ** 3, (116 * f_z - 16) / kappa)
77
+
78
+ # Multiply by the D65 white point
79
+ xyz_ref = torch.tensor([0.95047, 1.0, 1.08883], dtype=lab.dtype, device=lab.device)
80
+ xyz = torch.stack([x, y, z], dim=-1) * xyz_ref[None, None, None, None, :]
81
+
82
+ # XYZ to linear RGB
83
+ rgb_from_xyz = torch.tensor([
84
+ [3.2404542, -1.5371385, -0.4985314],
85
+ [-0.9692660, 1.8760108, 0.0415560],
86
+ [0.0556434, -0.2040259, 1.0572252]
87
+ ], dtype=lab.dtype, device=lab.device)
88
+
89
+ # Shape handling: flatten for the matrix multiply
90
+ shape = xyz.shape
91
+ xyz_flat = xyz.reshape(-1, 3) # (N, 3)
92
+ linear_rgb_flat = xyz_flat @ rgb_from_xyz.T # (N, 3)
93
+ linear_rgb = linear_rgb_flat.reshape(shape) # restore the original shape
94
+
95
+ # Linear RGB to sRGB (gamma correction)
96
+ rgb = torch.where(
97
+ linear_rgb > 0.0031308,
98
+ 1.055 * (linear_rgb ** (1/2.4)) - 0.055,
99
+ 12.92 * linear_rgb
100
+ )
101
+
102
+ # Clamp the output to [0, 1]
103
+ rgb = torch.clamp(rgb, 0.0, 1.0)
104
+ return rgb
105
+
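Since rgb_to_lab_torch and lab_to_rgb_torch above are intended to be inverses of each other (up to clamping), a small round-trip check is a useful sanity test; a minimal sketch, assuming both functions are in scope (e.g. imported from flash_head.utils.utils):

import torch

rgb = torch.rand(1, 4, 8, 8, 3)  # (B, T, H, W, C), values in [0, 1]
restored = lab_to_rgb_torch(rgb_to_lab_torch(rgb))
print((rgb - restored).abs().max())  # expected to be small (floating-point error only)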
106
+ def match_and_blend_colors_torch(
107
+ source_chunk: torch.Tensor,
108
+ reference_image: torch.Tensor,
109
+ strength: float
110
+ ) -> torch.Tensor:
111
+ """
112
+ Fully batched GPU implementation: match the colors of a video chunk to a reference image and blend them (supports B > 1 and all T frames in parallel).
113
+
114
+ Args:
115
+ source_chunk (torch.Tensor): video chunk (B, C, T, H, W), range [-1, 1]
116
+ reference_image (torch.Tensor): reference image (B, C, 1, H, W), range [-1, 1] (B must match source_chunk)
117
+ strength (float): color-correction strength (0.0-1.0); 0.0 = no correction, 1.0 = full correction
118
+
119
+ Returns:
120
+ torch.Tensor: color-corrected video chunk (B, C, T, H, W), range [-1, 1]
121
+ """
122
+ # Strength 0: return the input unchanged
123
+ if strength <= 0.0:
124
+ return source_chunk.clone()
125
+
126
+ # Validate the strength range
127
+ if not 0.0 <= strength <= 1.0:
128
+ raise ValueError(f"strength must be between 0.0 and 1.0, got {strength}")
129
+
130
+ # Validate the input shapes (batch sizes must match; the reference image has T = 1)
131
+ B, C, T, H, W = source_chunk.shape
132
+ assert reference_image.shape == (B, C, 1, H, W), \
133
+ f"reference_image must have shape (B, C, 1, H, W), got {reference_image.shape}"
134
+ assert C == 3, f"only 3-channel RGB input is supported, got {C} channels"
135
+
136
+ # Keep device and dtype consistent
137
+ device = source_chunk.device
138
+ dtype = source_chunk.dtype
139
+ reference_image = reference_image.to(device=device, dtype=dtype)
140
+
141
+ # 1. Map from [-1, 1] to [0, 1] (directly on the GPU)
142
+ source_01 = (source_chunk + 1.0) / 2.0
143
+ ref_01 = (reference_image + 1.0) / 2.0
144
+
145
+ # 2. Reorder dims: (B, C, T, H, W) -> (B, T, H, W, C) for the color-space conversion
146
+ # Reference image: (B, C, 1, H, W) -> (B, 1, H, W, C)
147
+ source_permuted = source_01.permute(0, 2, 3, 4, 1) # move channels to the last dim
148
+ ref_permuted = ref_01.permute(0, 2, 3, 4, 1)
149
+
150
+ # 3. RGB to Lab (all frames in one batch)
151
+ source_lab = rgb_to_lab_torch(source_permuted)
152
+ ref_lab = rgb_to_lab_torch(ref_permuted) # (B, 1, H, W, 3)
153
+
154
+ # 4. Batched color transfer: match the mean and std of the L/a/b channels (core step)
155
+ # Per-channel mean and std of the reference (statistics over H and W, keeping the B dim)
156
+ ref_mean = ref_lab.mean(dim=[2, 3], keepdim=True) # (B, 1, 1, 1, 3)
157
+ ref_std = ref_lab.std(dim=[2, 3], keepdim=True, unbiased=False) # (B, 1, 1, 1, 3)
158
+
159
+ # Per-channel mean and std of the source video (statistics over H and W, keeping the B and T dims)
160
+ source_mean = source_lab.mean(dim=[2, 3], keepdim=True) # (B, T, 1, 1, 3)
161
+ source_std = source_lab.std(dim=[2, 3], keepdim=True, unbiased=False) # (B, T, 1, 1, 3)
162
+
163
+ # Avoid division by a zero std (replace zeros with 1.0)
164
+ source_std_safe = torch.where(source_std < 1e-8, torch.ones_like(source_std), source_std)
165
+
166
+ # Color-transfer formula: (source - source_mean) * (ref_std / source_std) + ref_mean
167
+ corrected_lab = (source_lab - source_mean) * (ref_std / source_std_safe) + ref_mean
168
+
169
+ # 5. Lab back to RGB (all corrected frames in one batch)
170
+ corrected_rgb_01 = lab_to_rgb_torch(corrected_lab)
171
+
172
+ # 6. Blend the original and corrected frames, weighted by strength
173
+ blended_rgb_01 = (1 - strength) * source_permuted + strength * corrected_rgb_01
174
+
175
+ # 7. Restore the dim order and value range: (B, T, H, W, C) -> (B, C, T, H, W), [0, 1] -> [-1, 1]
176
+ blended_rgb_01 = blended_rgb_01.permute(0, 4, 1, 2, 3) # move channels back to dim 1
177
+ blended_rgb_minus1_1 = (blended_rgb_01 * 2.0) - 1.0
178
+
179
+ # 8. Ensure the output is contiguous in memory
180
+ output = blended_rgb_minus1_1.contiguous().to(device=device, dtype=dtype)
181
+
182
+ return output
183
+
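A minimal usage sketch of match_and_blend_colors_torch, with random tensors standing in for a real video chunk and reference frame (assuming the function is in scope, e.g. imported from flash_head.utils.utils):

import torch

chunk = torch.rand(1, 3, 16, 64, 64) * 2 - 1      # (B, C, T, H, W) in [-1, 1]
reference = torch.rand(1, 3, 1, 64, 64) * 2 - 1   # (B, C, 1, H, W) in [-1, 1]
corrected = match_and_blend_colors_torch(chunk, reference, strength=0.5)
print(corrected.shape)  # torch.Size([1, 3, 16, 64, 64]), still in [-1, 1]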
184
+ def resize_and_centercrop(cond_image, target_size):
185
+ """
186
+ Resize image or tensor to the target size without padding.
187
+ """
188
+
189
+ # Get the original size
190
+ if isinstance(cond_image, torch.Tensor):
191
+ _, orig_h, orig_w = cond_image.shape
192
+ else:
193
+ orig_h, orig_w = cond_image.height, cond_image.width
194
+
195
+ target_h, target_w = target_size
196
+
197
+ # Calculate the scaling factor for resizing
198
+ scale_h = target_h / orig_h
199
+ scale_w = target_w / orig_w
200
+
201
+ # Compute the final size
202
+ scale = max(scale_h, scale_w)
203
+ final_h = math.ceil(scale * orig_h)
204
+ final_w = math.ceil(scale * orig_w)
205
+
206
+ # Resize
207
+ if isinstance(cond_image, torch.Tensor):
208
+ if len(cond_image.shape) == 3:
209
+ cond_image = cond_image[None]
210
+ resized_tensor = nn.functional.interpolate(cond_image, size=(final_h, final_w), mode='nearest').contiguous()
211
+ # crop
212
+ cropped_tensor = transforms.functional.center_crop(resized_tensor, target_size)
213
+ cropped_tensor = cropped_tensor.squeeze(0)
214
+ else:
215
+ resized_image = cond_image.resize((final_w, final_h), resample=Image.BILINEAR)
216
+ resized_image = np.array(resized_image)
217
+ # tensor and crop
218
+ resized_tensor = torch.from_numpy(resized_image)[None, ...].permute(0, 3, 1, 2).contiguous()
219
+ cropped_tensor = transforms.functional.center_crop(resized_tensor, target_size)
220
+ cropped_tensor = cropped_tensor[:, :, None, :, :]
221
+
222
+ return cropped_tensor
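A short sketch of resize_and_centercrop with a PIL input; note that for PIL images the function returns a uint8 tensor with an extra singleton time dim (the 640x360 input size is hypothetical):

from PIL import Image

img = Image.new("RGB", (640, 360))                       # hypothetical 640x360 input
out = resize_and_centercrop(img, target_size=(512, 512))
print(out.shape)  # torch.Size([1, 3, 1, 512, 512])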
flash_head/wan/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .vae import WanVAE
2
+
3
+ __all__ = [
4
+ 'WanVAE',
5
+ ]
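With this export in place, the VAE defined in vae.py below can be constructed from the package. A minimal sketch, assuming a Wan VAE checkpoint is available at the default path and a video tensor in [-1, 1] whose H and W are divisible by 8 (the 17x480x832 shape is an assumption consistent with the latent sizes used below):

import torch
from flash_head.wan.modules import WanVAE

vae = WanVAE(z_dim=16, vae_path="cache/vae_step_411000.pth", dtype=torch.float, device="cuda")
video = torch.rand(1, 3, 17, 480, 832, device="cuda") * 2 - 1  # [1, C, T, H, W], assumed in [-1, 1]
latents = vae.encode(video)  # latent with H and W downsampled 8x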
flash_head/wan/modules/vae.py ADDED
@@ -0,0 +1,1598 @@
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from loguru import logger
9
+
10
+ __all__ = [
11
+ "WanVAE",
12
+ ]
13
+
14
+ CACHE_T = 2
15
+
16
+
17
+ class CausalConv3d(nn.Conv3d):
18
+ """
19
+ Causal 3D convolution.
20
+ """
21
+
22
+ def __init__(self, *args, **kwargs):
23
+ super().__init__(*args, **kwargs)
24
+ self._padding = (
25
+ self.padding[2],
26
+ self.padding[2],
27
+ self.padding[1],
28
+ self.padding[1],
29
+ 2 * self.padding[0],
30
+ 0,
31
+ )
32
+ self.padding = (0, 0, 0)
33
+
34
+ def forward(self, x, cache_x=None):
35
+ padding = list(self._padding)
36
+ if cache_x is not None and self._padding[4] > 0:
37
+ cache_x = cache_x.to(x.device)
38
+ x = torch.cat([cache_x, x], dim=2)
39
+ padding[4] -= cache_x.shape[2]
40
+ x = F.pad(x, padding)
41
+
42
+ return super().forward(x)
43
+
44
+
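The asymmetric temporal padding above (2 * padding in front, none behind) is what makes the convolution causal: each output frame depends only on the current and earlier input frames, and the time length is preserved. A quick shape-check sketch (the sizes are arbitrary):

import torch

conv = CausalConv3d(4, 4, kernel_size=3, padding=1)
x = torch.randn(1, 4, 9, 16, 16)   # (B, C, T, H, W)
print(conv(x).shape)               # torch.Size([1, 4, 9, 16, 16]) -- T unchanged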
45
+ class RMS_norm(nn.Module):
46
+ def __init__(self, dim, channel_first=True, images=True, bias=False):
47
+ super().__init__()
48
+ broadcastable_dims = (1, 1, 1) if not images else (1, 1)
49
+ shape = (dim, *broadcastable_dims) if channel_first else (dim,)
50
+
51
+ self.channel_first = channel_first
52
+ self.scale = dim**0.5
53
+ self.gamma = nn.Parameter(torch.ones(shape))
54
+ self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
55
+
56
+ def forward(self, x):
57
+ return (
58
+ F.normalize(x, dim=(1 if self.channel_first else -1))
59
+ * self.scale
60
+ * self.gamma
61
+ + self.bias
62
+ )
63
+
64
+
65
+ class Upsample(nn.Upsample):
66
+ def forward(self, x):
67
+ """
68
+ Fix bfloat16 support for nearest neighbor interpolation.
69
+ """
70
+ return super().forward(x)
71
+
72
+
73
+ class Resample(nn.Module):
74
+ def __init__(self, dim, mode):
75
+ assert mode in (
76
+ "none",
77
+ "upsample2d",
78
+ "upsample3d",
79
+ "downsample2d",
80
+ "downsample3d",
81
+ )
82
+ super().__init__()
83
+ self.dim = dim
84
+ self.mode = mode
85
+
86
+ # layers
87
+ if mode == "upsample2d":
88
+ self.resample = nn.Sequential(
89
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
90
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
91
+ )
92
+ elif mode == "upsample3d":
93
+ self.resample = nn.Sequential(
94
+ Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
95
+ nn.Conv2d(dim, dim // 2, 3, padding=1),
96
+ )
97
+ self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
98
+
99
+ elif mode == "downsample2d":
100
+ self.resample = nn.Sequential(
101
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
102
+ )
103
+ elif mode == "downsample3d":
104
+ self.resample = nn.Sequential(
105
+ nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2))
106
+ )
107
+ self.time_conv = CausalConv3d(
108
+ dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)
109
+ )
110
+
111
+ else:
112
+ self.resample = nn.Identity()
113
+
114
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
115
+ b, c, t, h, w = x.size()
116
+ if self.mode == "upsample3d":
117
+ if feat_cache is not None:
118
+ idx = feat_idx[0]
119
+ if feat_cache[idx] is None:
120
+ feat_cache[idx] = "Rep"
121
+ feat_idx[0] += 1
122
+ else:
123
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
124
+ if (
125
+ cache_x.shape[2] < 2
126
+ and feat_cache[idx] is not None
127
+ and feat_cache[idx] != "Rep"
128
+ ):
129
+ # cache last frame of last two chunk
130
+ cache_x = torch.cat(
131
+ [
132
+ feat_cache[idx][:, :, -1, :, :]
133
+ .unsqueeze(2)
134
+ .to(cache_x.device),
135
+ cache_x,
136
+ ],
137
+ dim=2,
138
+ )
139
+ if (
140
+ cache_x.shape[2] < 2
141
+ and feat_cache[idx] is not None
142
+ and feat_cache[idx] == "Rep"
143
+ ):
144
+ cache_x = torch.cat(
145
+ [torch.zeros_like(cache_x).to(cache_x.device), cache_x],
146
+ dim=2,
147
+ )
148
+ if feat_cache[idx] == "Rep":
149
+ x = self.time_conv(x)
150
+ else:
151
+ x = self.time_conv(x, feat_cache[idx])
152
+ feat_cache[idx] = cache_x
153
+ feat_idx[0] += 1
154
+
155
+ x = x.reshape(b, 2, c, t, h, w)
156
+ x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
157
+ x = x.reshape(b, c, t * 2, h, w)
158
+ t = x.shape[2]
159
+ x = rearrange(x, "b c t h w -> (b t) c h w")
160
+ x = self.resample(x)
161
+ x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
162
+
163
+ if self.mode == "downsample3d":
164
+ if feat_cache is not None:
165
+ idx = feat_idx[0]
166
+ if feat_cache[idx] is None:
167
+ feat_cache[idx] = x.clone()
168
+ feat_idx[0] += 1
169
+ else:
170
+ cache_x = x[:, :, -1:, :, :].clone()
171
+ # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
172
+ # # cache last frame of last two chunk
173
+ # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
174
+
175
+ x = self.time_conv(
176
+ torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)
177
+ )
178
+ feat_cache[idx] = cache_x
179
+ feat_idx[0] += 1
180
+ return x
181
+
182
+ def init_weight(self, conv):
183
+ conv_weight = conv.weight
184
+ nn.init.zeros_(conv_weight)
185
+ c1, c2, t, h, w = conv_weight.size()
186
+ one_matrix = torch.eye(c1, c2)
187
+ init_matrix = one_matrix
188
+ nn.init.zeros_(conv_weight)
189
+ # conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
190
+ conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
191
+ conv.weight.data.copy_(conv_weight)
192
+ nn.init.zeros_(conv.bias.data)
193
+
194
+ def init_weight2(self, conv):
195
+ conv_weight = conv.weight.data
196
+ nn.init.zeros_(conv_weight)
197
+ c1, c2, t, h, w = conv_weight.size()
198
+ init_matrix = torch.eye(c1 // 2, c2)
199
+ # init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
200
+ conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
201
+ conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
202
+ conv.weight.data.copy_(conv_weight)
203
+ nn.init.zeros_(conv.bias.data)
204
+
205
+
206
+ class ResidualBlock(nn.Module):
207
+ def __init__(self, in_dim, out_dim, dropout=0.0):
208
+ super().__init__()
209
+ self.in_dim = in_dim
210
+ self.out_dim = out_dim
211
+
212
+ # layers
213
+ self.residual = nn.Sequential(
214
+ RMS_norm(in_dim, images=False),
215
+ nn.SiLU(),
216
+ CausalConv3d(in_dim, out_dim, 3, padding=1),
217
+ RMS_norm(out_dim, images=False),
218
+ nn.SiLU(),
219
+ nn.Dropout(dropout),
220
+ CausalConv3d(out_dim, out_dim, 3, padding=1),
221
+ )
222
+ self.shortcut = (
223
+ CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
224
+ )
225
+
226
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
227
+ h = self.shortcut(x)
228
+ for layer in self.residual:
229
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
230
+ idx = feat_idx[0]
231
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
232
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
233
+ # cache last frame of last two chunk
234
+ cache_x = torch.cat(
235
+ [
236
+ feat_cache[idx][:, :, -1, :, :]
237
+ .unsqueeze(2)
238
+ .to(cache_x.device),
239
+ cache_x,
240
+ ],
241
+ dim=2,
242
+ )
243
+ x = layer(x, feat_cache[idx])
244
+ feat_cache[idx] = cache_x
245
+ feat_idx[0] += 1
246
+ else:
247
+ x = layer(x)
248
+ return x + h
249
+
250
+
251
+ class AttentionBlock(nn.Module):
252
+ """
253
+ Causal self-attention with a single head.
254
+ """
255
+
256
+ def __init__(self, dim):
257
+ super().__init__()
258
+ self.dim = dim
259
+
260
+ # layers
261
+ self.norm = RMS_norm(dim)
262
+ self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
263
+ self.proj = nn.Conv2d(dim, dim, 1)
264
+
265
+ # zero out the last layer params
266
+ nn.init.zeros_(self.proj.weight)
267
+
268
+ def forward(self, x):
269
+ identity = x
270
+ b, c, t, h, w = x.size()
271
+ x = rearrange(x, "b c t h w -> (b t) c h w")
272
+ x = self.norm(x)
273
+ # compute query, key, value
274
+ q, k, v = (
275
+ self.to_qkv(x)
276
+ .reshape(b * t, 1, c * 3, -1)
277
+ .permute(0, 1, 3, 2)
278
+ .contiguous()
279
+ .chunk(3, dim=-1)
280
+ )
281
+
282
+ # apply attention
283
+ x = F.scaled_dot_product_attention(
284
+ q,
285
+ k,
286
+ v,
287
+ )
288
+ x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
289
+
290
+ # output
291
+ x = self.proj(x)
292
+ x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
293
+ return x + identity
294
+
295
+
296
+ class Encoder3d(nn.Module):
297
+ def __init__(
298
+ self,
299
+ dim=128,
300
+ z_dim=4,
301
+ dim_mult=[1, 2, 4, 4],
302
+ num_res_blocks=2,
303
+ attn_scales=[],
304
+ temperal_downsample=[True, True, False],
305
+ dropout=0.0,
306
+ ):
307
+ super().__init__()
308
+ self.dim = dim
309
+ self.z_dim = z_dim
310
+ self.dim_mult = dim_mult
311
+ self.num_res_blocks = num_res_blocks
312
+ self.attn_scales = attn_scales
313
+ self.temperal_downsample = temperal_downsample
314
+
315
+ # dimensions
316
+ dims = [dim * u for u in [1] + dim_mult]
317
+ scale = 1.0
318
+
319
+ # init block
320
+ self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
321
+
322
+ # downsample blocks
323
+ downsamples = []
324
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
325
+ # residual (+attention) blocks
326
+ for _ in range(num_res_blocks):
327
+ downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
328
+ if scale in attn_scales:
329
+ downsamples.append(AttentionBlock(out_dim))
330
+ in_dim = out_dim
331
+
332
+ # downsample block
333
+ if i != len(dim_mult) - 1:
334
+ mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
335
+ downsamples.append(Resample(out_dim, mode=mode))
336
+ scale /= 2.0
337
+ self.downsamples = nn.Sequential(*downsamples)
338
+
339
+ # middle blocks
340
+ self.middle = nn.Sequential(
341
+ ResidualBlock(out_dim, out_dim, dropout),
342
+ AttentionBlock(out_dim),
343
+ ResidualBlock(out_dim, out_dim, dropout),
344
+ )
345
+
346
+ # output blocks
347
+ self.head = nn.Sequential(
348
+ RMS_norm(out_dim, images=False),
349
+ nn.SiLU(),
350
+ CausalConv3d(out_dim, z_dim, 3, padding=1),
351
+ )
352
+
353
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
354
+ if feat_cache is not None:
355
+ idx = feat_idx[0]
356
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
357
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
358
+ # cache last frame of last two chunk
359
+ cache_x = torch.cat(
360
+ [
361
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
362
+ cache_x,
363
+ ],
364
+ dim=2,
365
+ )
366
+ x = self.conv1(x, feat_cache[idx])
367
+ feat_cache[idx] = cache_x
368
+ feat_idx[0] += 1
369
+ else:
370
+ x = self.conv1(x)
371
+
372
+ ## downsamples
373
+ for layer in self.downsamples:
374
+ if feat_cache is not None:
375
+ x = layer(x, feat_cache, feat_idx)
376
+ else:
377
+ x = layer(x)
378
+
379
+ ## middle
380
+ for layer in self.middle:
381
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
382
+ x = layer(x, feat_cache, feat_idx)
383
+ else:
384
+ x = layer(x)
385
+
386
+ ## head
387
+ for layer in self.head:
388
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
389
+ idx = feat_idx[0]
390
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
391
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
392
+ # cache last frame of last two chunk
393
+ cache_x = torch.cat(
394
+ [
395
+ feat_cache[idx][:, :, -1, :, :]
396
+ .unsqueeze(2)
397
+ .to(cache_x.device),
398
+ cache_x,
399
+ ],
400
+ dim=2,
401
+ )
402
+ x = layer(x, feat_cache[idx])
403
+ feat_cache[idx] = cache_x
404
+ feat_idx[0] += 1
405
+ else:
406
+ x = layer(x)
407
+ return x
408
+
409
+
410
+ class Decoder3d(nn.Module):
411
+ def __init__(
412
+ self,
413
+ dim=128,
414
+ z_dim=4,
415
+ dim_mult=[1, 2, 4, 4],
416
+ num_res_blocks=2,
417
+ attn_scales=[],
418
+ temperal_upsample=[False, True, True],
419
+ dropout=0.0,
420
+ ):
421
+ super().__init__()
422
+ self.dim = dim
423
+ self.z_dim = z_dim
424
+ self.dim_mult = dim_mult
425
+ self.num_res_blocks = num_res_blocks
426
+ self.attn_scales = attn_scales
427
+ self.temperal_upsample = temperal_upsample
428
+
429
+ # dimensions
430
+ dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
431
+
432
+ scale = 1.0 / 2 ** (len(dim_mult) - 2)
433
+
434
+ # init block
435
+ self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
436
+
437
+ # middle blocks
438
+ self.middle = nn.Sequential(
439
+ ResidualBlock(dims[0], dims[0], dropout),
440
+ AttentionBlock(dims[0]),
441
+ ResidualBlock(dims[0], dims[0], dropout),
442
+ )
443
+
444
+ # upsample blocks
445
+ upsamples = []
446
+ for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
447
+ # residual (+attention) blocks
448
+ if i == 1 or i == 2 or i == 3:
449
+ in_dim = in_dim // 2
450
+ for _ in range(num_res_blocks + 1):
451
+ upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
452
+ if scale in attn_scales:
453
+ upsamples.append(AttentionBlock(out_dim))
454
+ in_dim = out_dim
455
+
456
+ # upsample block
457
+ if i != len(dim_mult) - 1:
458
+ mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
459
+ upsamples.append(Resample(out_dim, mode=mode))
460
+ scale *= 2.0
461
+ self.upsamples = nn.Sequential(*upsamples)
462
+
463
+ # output blocks
464
+ self.head = nn.Sequential(
465
+ RMS_norm(out_dim, images=False),
466
+ nn.SiLU(),
467
+ CausalConv3d(out_dim, 3, 3, padding=1),
468
+ )
469
+
470
+ def forward(self, x, feat_cache=None, feat_idx=[0]):
471
+ ## conv1
472
+ if feat_cache is not None:
473
+ idx = feat_idx[0]
474
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
475
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
476
+ # cache last frame of last two chunk
477
+ cache_x = torch.cat(
478
+ [
479
+ feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
480
+ cache_x,
481
+ ],
482
+ dim=2,
483
+ )
484
+ x = self.conv1(x, feat_cache[idx])
485
+ feat_cache[idx] = cache_x
486
+ feat_idx[0] += 1
487
+ else:
488
+ x = self.conv1(x)
489
+
490
+ ## middle
491
+ for layer in self.middle:
492
+ if isinstance(layer, ResidualBlock) and feat_cache is not None:
493
+ x = layer(x, feat_cache, feat_idx)
494
+ else:
495
+ x = layer(x)
496
+
497
+ ## upsamples
498
+ for layer in self.upsamples:
499
+ if feat_cache is not None:
500
+ x = layer(x, feat_cache, feat_idx)
501
+ else:
502
+ x = layer(x)
503
+
504
+ ## head
505
+ for layer in self.head:
506
+ if isinstance(layer, CausalConv3d) and feat_cache is not None:
507
+ idx = feat_idx[0]
508
+ cache_x = x[:, :, -CACHE_T:, :, :].clone()
509
+ if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
510
+ # cache last frame of last two chunk
511
+ cache_x = torch.cat(
512
+ [
513
+ feat_cache[idx][:, :, -1, :, :]
514
+ .unsqueeze(2)
515
+ .to(cache_x.device),
516
+ cache_x,
517
+ ],
518
+ dim=2,
519
+ )
520
+ x = layer(x, feat_cache[idx])
521
+ feat_cache[idx] = cache_x
522
+ feat_idx[0] += 1
523
+ else:
524
+ x = layer(x)
525
+ return x
526
+
527
+
528
+ def count_conv3d(model):
529
+ count = 0
530
+ for m in model.modules():
531
+ if isinstance(m, CausalConv3d):
532
+ count += 1
533
+ return count
534
+
535
+
536
+ class WanVAE_(nn.Module):
537
+ def __init__(
538
+ self,
539
+ dim=128,
540
+ z_dim=4,
541
+ dim_mult=[1, 2, 4, 4],
542
+ num_res_blocks=2,
543
+ attn_scales=[],
544
+ temperal_downsample=[True, True, False],
545
+ dropout=0.0,
546
+ ):
547
+ super().__init__()
548
+ self.dim = dim
549
+ self.z_dim = z_dim
550
+ self.dim_mult = dim_mult
551
+ self.num_res_blocks = num_res_blocks
552
+ self.attn_scales = attn_scales
553
+ self.temperal_downsample = temperal_downsample
554
+ self.temperal_upsample = temperal_downsample[::-1]
555
+ self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
556
+
557
+ # The minimal tile height and width for spatial tiling to be used
558
+ self.tile_sample_min_height = 256
559
+ self.tile_sample_min_width = 256
560
+
561
+ # The minimal distance between two spatial tiles
562
+ self.tile_sample_stride_height = 192
563
+ self.tile_sample_stride_width = 192
564
+ # modules
565
+ self.encoder = Encoder3d(
566
+ dim,
567
+ z_dim * 2,
568
+ dim_mult,
569
+ num_res_blocks,
570
+ attn_scales,
571
+ self.temperal_downsample,
572
+ dropout,
573
+ )
574
+ self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
575
+ self.conv2 = CausalConv3d(z_dim, z_dim, 1)
576
+ self.decoder = Decoder3d(
577
+ dim,
578
+ z_dim,
579
+ dim_mult,
580
+ num_res_blocks,
581
+ attn_scales,
582
+ self.temperal_upsample,
583
+ dropout,
584
+ )
585
+
586
+ def forward(self, x):
587
+ mu, log_var = self.encode(x)
588
+ z = self.reparameterize(mu, log_var)
589
+ x_recon = self.decode(z)
590
+ return x_recon, mu, log_var
591
+
592
+ def blend_v(self, a, b, blend_extent):
593
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
594
+ for y in range(blend_extent):
595
+ b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (
596
+ 1 - y / blend_extent
597
+ ) + b[:, :, :, y, :] * (y / blend_extent)
598
+ return b
599
+
600
+ def blend_h(self, a, b, blend_extent):
601
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
602
+ for x in range(blend_extent):
603
+ b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (
604
+ 1 - x / blend_extent
605
+ ) + b[:, :, :, :, x] * (x / blend_extent)
606
+ return b
607
+
608
+ def tiled_encode(self, x, scale):
609
+ _, _, num_frames, height, width = x.shape
610
+ latent_height = height // self.spatial_compression_ratio
611
+ latent_width = width // self.spatial_compression_ratio
612
+
613
+ tile_latent_min_height = (
614
+ self.tile_sample_min_height // self.spatial_compression_ratio
615
+ )
616
+ tile_latent_min_width = (
617
+ self.tile_sample_min_width // self.spatial_compression_ratio
618
+ )
619
+ tile_latent_stride_height = (
620
+ self.tile_sample_stride_height // self.spatial_compression_ratio
621
+ )
622
+ tile_latent_stride_width = (
623
+ self.tile_sample_stride_width // self.spatial_compression_ratio
624
+ )
625
+
626
+ blend_height = tile_latent_min_height - tile_latent_stride_height
627
+ blend_width = tile_latent_min_width - tile_latent_stride_width
628
+
629
+ # Split x into overlapping tiles and encode them separately.
630
+ # The tiles have an overlap to avoid seams between tiles.
631
+ rows = []
632
+ for i in range(0, height, self.tile_sample_stride_height):
633
+ row = []
634
+ for j in range(0, width, self.tile_sample_stride_width):
635
+ self.clear_cache()
636
+ time = []
637
+ frame_range = 1 + (num_frames - 1) // 4
638
+ for k in range(frame_range):
639
+ self._enc_conv_idx = [0]
640
+ if k == 0:
641
+ tile = x[
642
+ :,
643
+ :,
644
+ :1,
645
+ i : i + self.tile_sample_min_height,
646
+ j : j + self.tile_sample_min_width,
647
+ ]
648
+ else:
649
+ tile = x[
650
+ :,
651
+ :,
652
+ 1 + 4 * (k - 1) : 1 + 4 * k,
653
+ i : i + self.tile_sample_min_height,
654
+ j : j + self.tile_sample_min_width,
655
+ ]
656
+ tile = self.encoder(
657
+ tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx
658
+ )
659
+ mu, log_var = self.conv1(tile).chunk(2, dim=1)
660
+ if isinstance(scale[0], torch.Tensor):
661
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[
662
+ 1
663
+ ].view(1, self.z_dim, 1, 1, 1)
664
+ else:
665
+ mu = (mu - scale[0]) * scale[1]
666
+
667
+ time.append(mu)
668
+
669
+ row.append(torch.cat(time, dim=2))
670
+ rows.append(row)
671
+ self.clear_cache()
672
+
673
+ result_rows = []
674
+ for i, row in enumerate(rows):
675
+ result_row = []
676
+ for j, tile in enumerate(row):
677
+ # blend the above tile and the left tile
678
+ # to the current tile and add the current tile to the result row
679
+ if i > 0:
680
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
681
+ if j > 0:
682
+ tile = self.blend_h(row[j - 1], tile, blend_width)
683
+ result_row.append(
684
+ tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width]
685
+ )
686
+ result_rows.append(torch.cat(result_row, dim=-1))
687
+
688
+ enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
689
+ return enc
690
+
691
+ def tiled_decode(self, z, scale):
692
+ if isinstance(scale[0], torch.Tensor):
693
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
694
+ 1, self.z_dim, 1, 1, 1
695
+ )
696
+ else:
697
+ z = z / scale[1] + scale[0]
698
+
699
+ _, _, num_frames, height, width = z.shape
700
+ sample_height = height * self.spatial_compression_ratio
701
+ sample_width = width * self.spatial_compression_ratio
702
+
703
+ tile_latent_min_height = (
704
+ self.tile_sample_min_height // self.spatial_compression_ratio
705
+ )
706
+ tile_latent_min_width = (
707
+ self.tile_sample_min_width // self.spatial_compression_ratio
708
+ )
709
+ tile_latent_stride_height = (
710
+ self.tile_sample_stride_height // self.spatial_compression_ratio
711
+ )
712
+ tile_latent_stride_width = (
713
+ self.tile_sample_stride_width // self.spatial_compression_ratio
714
+ )
715
+
716
+ blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
717
+ blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
718
+
719
+ # Split z into overlapping tiles and decode them separately.
720
+ # The tiles have an overlap to avoid seams between tiles.
721
+ rows = []
722
+ for i in range(0, height, tile_latent_stride_height):
723
+ row = []
724
+ for j in range(0, width, tile_latent_stride_width):
725
+ self.clear_cache()
726
+ time = []
727
+ for k in range(num_frames):
728
+ self._conv_idx = [0]
729
+ tile = z[
730
+ :,
731
+ :,
732
+ k : k + 1,
733
+ i : i + tile_latent_min_height,
734
+ j : j + tile_latent_min_width,
735
+ ]
736
+ tile = self.conv2(tile)
737
+ decoded = self.decoder(
738
+ tile, feat_cache=self._feat_map, feat_idx=self._conv_idx
739
+ )
740
+ time.append(decoded)
741
+ row.append(torch.cat(time, dim=2))
742
+ rows.append(row)
743
+ self.clear_cache()
744
+
745
+ result_rows = []
746
+ for i, row in enumerate(rows):
747
+ result_row = []
748
+ for j, tile in enumerate(row):
749
+ # blend the above tile and the left tile
750
+ # to the current tile and add the current tile to the result row
751
+ if i > 0:
752
+ tile = self.blend_v(rows[i - 1][j], tile, blend_height)
753
+ if j > 0:
754
+ tile = self.blend_h(row[j - 1], tile, blend_width)
755
+ result_row.append(
756
+ tile[
757
+ :,
758
+ :,
759
+ :,
760
+ : self.tile_sample_stride_height,
761
+ : self.tile_sample_stride_width,
762
+ ]
763
+ )
764
+ result_rows.append(torch.cat(result_row, dim=-1))
765
+
766
+ dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
767
+
768
+ return dec
769
+
770
+ def encode(self, x, scale, return_mu=False):
771
+ self.clear_cache()
772
+ ## cache
773
+ t = x.shape[2]
774
+ iter_ = 1 + (t - 1) // 4
775
+ for i in range(iter_):
776
+ self._enc_conv_idx = [0]
777
+ if i == 0:
778
+ out = self.encoder(
779
+ x[:, :, :1, :, :],
780
+ feat_cache=self._enc_feat_map,
781
+ feat_idx=self._enc_conv_idx,
782
+ )
783
+ else:
784
+ out_ = self.encoder(
785
+ x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
786
+ feat_cache=self._enc_feat_map,
787
+ feat_idx=self._enc_conv_idx,
788
+ )
789
+ out = torch.cat([out, out_], 2)
790
+ mu, log_var = self.conv1(out).chunk(2, dim=1)
791
+ if isinstance(scale[0], torch.Tensor):
792
+ mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
793
+ 1, self.z_dim, 1, 1, 1
794
+ )
795
+ else:
796
+ mu = (mu - scale[0]) * scale[1]
797
+
798
+ self.clear_cache()
799
+ if return_mu:
800
+ return mu, log_var
801
+ else:
802
+ return mu
803
+
804
+ def decode(self, z, scale):
805
+ self.clear_cache()
806
+
807
+ # z: [b,c,t,h,w]
808
+ if isinstance(scale[0], torch.Tensor):
809
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
810
+ 1, self.z_dim, 1, 1, 1
811
+ )
812
+ else:
813
+ z = z / scale[1] + scale[0]
814
+ iter_ = z.shape[2]
815
+ x = self.conv2(z)
816
+ for i in range(iter_):
817
+ self._conv_idx = [0]
818
+ if i == 0:
819
+ out = self.decoder(
820
+ x[:, :, i : i + 1, :, :],
821
+ feat_cache=self._feat_map,
822
+ feat_idx=self._conv_idx,
823
+ )
824
+ else:
825
+ out_ = self.decoder(
826
+ x[:, :, i : i + 1, :, :],
827
+ feat_cache=self._feat_map,
828
+ feat_idx=self._conv_idx,
829
+ )
830
+ out = torch.cat([out, out_], 2)
831
+
832
+ self.clear_cache()
833
+ return out
834
+
835
+ def decode_stream(self, z, scale):
836
+ self.clear_cache()
837
+
838
+ # z: [b,c,t,h,w]
839
+ if isinstance(scale[0], torch.Tensor):
840
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
841
+ 1, self.z_dim, 1, 1, 1
842
+ )
843
+ else:
844
+ z = z / scale[1] + scale[0]
845
+ iter_ = z.shape[2]
846
+ x = self.conv2(z)
847
+ for i in range(iter_):
848
+ self._conv_idx = [0]
849
+ out = self.decoder(
850
+ x[:, :, i : i + 1, :, :],
851
+ feat_cache=self._feat_map,
852
+ feat_idx=self._conv_idx,
853
+ )
854
+ yield out
855
+
856
+ def cached_decode(self, z, scale):
857
+ # z: [b,c,t,h,w]
858
+ if isinstance(scale[0], torch.Tensor):
859
+ z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
860
+ 1, self.z_dim, 1, 1, 1
861
+ )
862
+ else:
863
+ z = z / scale[1] + scale[0]
864
+ iter_ = z.shape[2]
865
+ x = self.conv2(z)
866
+ for i in range(iter_):
867
+ self._conv_idx = [0]
868
+ if i == 0:
869
+ out = self.decoder(
870
+ x[:, :, i : i + 1, :, :],
871
+ feat_cache=self._feat_map,
872
+ feat_idx=self._conv_idx,
873
+ )
874
+ else:
875
+ out_ = self.decoder(
876
+ x[:, :, i : i + 1, :, :],
877
+ feat_cache=self._feat_map,
878
+ feat_idx=self._conv_idx,
879
+ )
880
+ out = torch.cat([out, out_], 2)
881
+ return out
882
+
883
+ def reparameterize(self, mu, log_var):
884
+ std = torch.exp(0.5 * log_var)
885
+ eps = torch.randn_like(std)
886
+ return eps * std + mu
887
+
888
+ def sample(self, imgs, deterministic=False, scale=[0, 1]):
889
+ mu, log_var = self.encode(imgs, scale, return_mu=True)
890
+ if deterministic:
891
+ return mu
892
+ std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
893
+ return mu + std * torch.randn_like(std), mu, log_var
894
+
895
+ def clear_cache(self):
896
+ self._conv_num = count_conv3d(self.decoder)
897
+ self._conv_idx = [0]
898
+ self._feat_map = [None] * self._conv_num
899
+ # cache encode
900
+ self._enc_conv_num = count_conv3d(self.encoder)
901
+ self._enc_conv_idx = [0]
902
+ self._enc_feat_map = [None] * self._enc_conv_num
903
+
904
+ def encode_video(self, x, scale=[0, 1]):
905
+ assert x.ndim == 5 # NTCHW
906
+ assert x.shape[2] % 3 == 0
907
+ x = x.transpose(1, 2)
908
+ y = x.mul(2).sub_(1)
909
+ y, mu, log_var = self.sample(y, scale=scale)
910
+ return y.transpose(1, 2).to(x), mu, log_var
911
+
912
+ def decode_video(self, x, scale=[0, 1]):
913
+ assert x.ndim == 5 # NTCHW
914
+ assert x.shape[2] % self.z_dim == 0
915
+ x = x.transpose(1, 2)
916
+ # B, C, T, H, W
917
+ y = x
918
+ y = self.decode(y, scale).clamp_(-1, 1)
919
+ y = y.mul_(0.5).add_(0.5).clamp_(0, 1) # NCTHW
920
+ return y.transpose(1, 2).to(x)
921
+
922
+
923
+ def _video_vae(
924
+ pretrained_path=None,
925
+ z_dim=None,
926
+ device="cpu",
927
+ dtype=torch.float,
928
+ **kwargs,
929
+ ):
930
+ """
931
+ Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
932
+ """
933
+ # params
934
+ cfg = dict(
935
+ dim=96,
936
+ z_dim=z_dim,
937
+ dim_mult=[1, 2, 4, 4],
938
+ num_res_blocks=2,
939
+ attn_scales=[],
940
+ temperal_downsample=[False, True, True],
941
+ dropout=0.0,
942
+ )
943
+ cfg.update(**kwargs)
944
+
945
+ # init model
946
+ with torch.device("meta"):
947
+ model = WanVAE_(**cfg)
948
+
949
+ # load checkpoint
950
+ model.load_state_dict(torch.load(pretrained_path, map_location=device), assign=True)
951
+
952
+ return model
953
+
954
+ class WanVAE:
955
+ def __init__(
956
+ self,
957
+ z_dim=16,
958
+ vae_path="cache/vae_step_411000.pth",
959
+ dtype=torch.float,
960
+ device="cuda",
961
+ parallel=False,
962
+ use_tiling=False,
963
+ use_2d_split=True,
964
+ ):
965
+ self.dtype = dtype
966
+ self.device = device
967
+ self.parallel = parallel
968
+ self.use_tiling = use_tiling
969
+ self.use_2d_split = use_2d_split
970
+
971
+ mean = [
972
+ -0.7571,
973
+ -0.7089,
974
+ -0.9113,
975
+ 0.1075,
976
+ -0.1745,
977
+ 0.9653,
978
+ -0.1517,
979
+ 1.5508,
980
+ 0.4134,
981
+ -0.0715,
982
+ 0.5517,
983
+ -0.3632,
984
+ -0.1922,
985
+ -0.9497,
986
+ 0.2503,
987
+ -0.2921,
988
+ ]
989
+ std = [
990
+ 2.8184,
991
+ 1.4541,
992
+ 2.3275,
993
+ 2.6558,
994
+ 1.2196,
995
+ 1.7708,
996
+ 2.6052,
997
+ 2.0743,
998
+ 3.2687,
999
+ 2.1526,
1000
+ 2.8652,
1001
+ 1.5579,
1002
+ 1.6382,
1003
+ 1.1253,
1004
+ 2.8251,
1005
+ 1.9160,
1006
+ ]
1007
+ self.mean = torch.tensor(mean, dtype=dtype, device=device)
1008
+ self.inv_std = 1.0 / torch.tensor(std, dtype=dtype, device=device)
1009
+ self.scale = [self.mean, self.inv_std]
1010
+
1011
+ # (height, width, world_size) -> (world_size_h, world_size_w)
1012
+ self.grid_table = {
1013
+ # world_size = 2
1014
+ (60, 104, 2): (1, 2),
1015
+ (68, 120, 2): (1, 2),
1016
+ (90, 160, 2): (1, 2),
1017
+ (60, 60, 2): (1, 2),
1018
+ (72, 72, 2): (1, 2),
1019
+ (88, 88, 2): (1, 2),
1020
+ (120, 120, 2): (1, 2),
1021
+ (104, 60, 2): (2, 1),
1022
+ (120, 68, 2): (2, 1),
1023
+ (160, 90, 2): (2, 1),
1024
+ # world_size = 4
1025
+ (60, 104, 4): (2, 2),
1026
+ (68, 120, 4): (2, 2),
1027
+ (90, 160, 4): (2, 2),
1028
+ (60, 60, 4): (2, 2),
1029
+ (72, 72, 4): (2, 2),
1030
+ (88, 88, 4): (2, 2),
1031
+ (120, 120, 4): (2, 2),
1032
+ (104, 60, 4): (2, 2),
1033
+ (120, 68, 4): (2, 2),
1034
+ (160, 90, 4): (2, 2),
1035
+ # world_size = 8
1036
+ (60, 104, 8): (2, 4),
1037
+ (68, 120, 8): (2, 4),
1038
+ (90, 160, 8): (2, 4),
1039
+ (60, 60, 8): (2, 4),
1040
+ (72, 72, 8): (2, 4),
1041
+ (88, 88, 8): (2, 4),
1042
+ (120, 120, 8): (2, 4),
1043
+ (104, 60, 8): (4, 2),
1044
+ (120, 68, 8): (4, 2),
1045
+ (160, 90, 8): (4, 2),
1046
+ }
1047
+
1048
+ # init model
1049
+ self.model = (
1050
+ _video_vae(
1051
+ pretrained_path=vae_path,
1052
+ z_dim=z_dim,
1053
+ dtype=dtype,
1054
+ )
1055
+ .eval()
1056
+ .requires_grad_(False)
1057
+ .to(device)
1058
+ .to(dtype)
1059
+ )
1060
+
1061
+ def _calculate_2d_grid(self, latent_height, latent_width, world_size):
1062
+ if (latent_height, latent_width, world_size) in self.grid_table:
1063
+ best_h, best_w = self.grid_table[(latent_height, latent_width, world_size)]
1064
+ # logger.info(f"Vae using cached 2D grid: {best_h}x{best_w} grid for {latent_height}x{latent_width} latent")
1065
+ return best_h, best_w
1066
+
1067
+ best_h, best_w = 1, world_size
1068
+ min_aspect_diff = float("inf")
1069
+
1070
+ for h in range(1, world_size + 1):
1071
+ if world_size % h == 0:
1072
+ w = world_size // h
1073
+ if latent_height % h == 0 and latent_width % w == 0:
1074
+ # Calculate how close this grid is to square
1075
+ aspect_diff = abs((latent_height / h) - (latent_width / w))
1076
+ if aspect_diff < min_aspect_diff:
1077
+ min_aspect_diff = aspect_diff
1078
+ best_h, best_w = h, w
1079
+ # logger.info(f"Vae using 2D grid & Update cache: {best_h}x{best_w} grid for {latent_height}x{latent_width} latent")
1080
+ self.grid_table[(latent_height, latent_width, world_size)] = (best_h, best_w)
1081
+ return best_h, best_w
1082
+
1083
+ def current_device(self):
1084
+ return next(self.model.parameters()).device
1085
+
1086
+ def encode_dist(self, video, world_size, cur_rank, split_dim):
1087
+ spatial_ratio = 8
1088
+
1089
+ if split_dim == 3:
1090
+ total_latent_len = video.shape[3] // spatial_ratio
1091
+ elif split_dim == 4:
1092
+ total_latent_len = video.shape[4] // spatial_ratio
1093
+ else:
1094
+ raise ValueError(f"Unsupported split_dim: {split_dim}")
1095
+
1096
+ splited_chunk_len = total_latent_len // world_size
1097
+ padding_size = 1
1098
+
1099
+ video_chunk_len = splited_chunk_len * spatial_ratio
1100
+ video_padding_len = padding_size * spatial_ratio
1101
+
1102
+ if cur_rank == 0:
1103
+ if split_dim == 3:
1104
+ video_chunk = video[
1105
+ :, :, :, : video_chunk_len + 2 * video_padding_len, :
1106
+ ].contiguous()
1107
+ elif split_dim == 4:
1108
+ video_chunk = video[
1109
+ :, :, :, :, : video_chunk_len + 2 * video_padding_len
1110
+ ].contiguous()
1111
+ elif cur_rank == world_size - 1:
1112
+ if split_dim == 3:
1113
+ video_chunk = video[
1114
+ :, :, :, -(video_chunk_len + 2 * video_padding_len) :, :
1115
+ ].contiguous()
1116
+ elif split_dim == 4:
1117
+ video_chunk = video[
1118
+ :, :, :, :, -(video_chunk_len + 2 * video_padding_len) :
1119
+ ].contiguous()
1120
+ else:
1121
+ start_idx = cur_rank * video_chunk_len - video_padding_len
1122
+ end_idx = (cur_rank + 1) * video_chunk_len + video_padding_len
1123
+ if split_dim == 3:
1124
+ video_chunk = video[:, :, :, start_idx:end_idx, :].contiguous()
1125
+ elif split_dim == 4:
1126
+ video_chunk = video[:, :, :, :, start_idx:end_idx].contiguous()
1127
+
1128
+ if self.use_tiling:
1129
+ encoded_chunk = self.model.tiled_encode(video_chunk, self.scale)
1130
+ else:
1131
+ encoded_chunk = self.model.encode(video_chunk, self.scale)
1132
+
1133
+ if cur_rank == 0:
1134
+ if split_dim == 3:
1135
+ encoded_chunk = encoded_chunk[
1136
+ :, :, :, :splited_chunk_len, :
1137
+ ].contiguous()
1138
+ elif split_dim == 4:
1139
+ encoded_chunk = encoded_chunk[
1140
+ :, :, :, :, :splited_chunk_len
1141
+ ].contiguous()
1142
+ elif cur_rank == world_size - 1:
1143
+ if split_dim == 3:
1144
+ encoded_chunk = encoded_chunk[
1145
+ :, :, :, -splited_chunk_len:, :
1146
+ ].contiguous()
1147
+ elif split_dim == 4:
1148
+ encoded_chunk = encoded_chunk[
1149
+ :, :, :, :, -splited_chunk_len:
1150
+ ].contiguous()
1151
+ else:
1152
+ if split_dim == 3:
1153
+ encoded_chunk = encoded_chunk[
1154
+ :, :, :, padding_size:-padding_size, :
1155
+ ].contiguous()
1156
+ elif split_dim == 4:
1157
+ encoded_chunk = encoded_chunk[
1158
+ :, :, :, :, padding_size:-padding_size
1159
+ ].contiguous()
1160
+
1161
+ full_encoded = [torch.empty_like(encoded_chunk) for _ in range(world_size)]
1162
+ dist.all_gather(full_encoded, encoded_chunk)
1163
+
1164
+ torch.cuda.synchronize()
1165
+
1166
+ encoded = torch.cat(full_encoded, dim=split_dim)
1167
+
1168
+ return encoded.squeeze(0)
1169
+
1170
+ def encode_dist_2d(self, video, world_size_h, world_size_w, cur_rank_h, cur_rank_w):
1171
+ spatial_ratio = 8
1172
+
1173
+ # Calculate chunk sizes for both dimensions
1174
+ total_latent_h = video.shape[3] // spatial_ratio
1175
+ total_latent_w = video.shape[4] // spatial_ratio
1176
+
1177
+ chunk_h = total_latent_h // world_size_h
1178
+ chunk_w = total_latent_w // world_size_w
1179
+
1180
+ padding_size = 1
1181
+ video_chunk_h = chunk_h * spatial_ratio
1182
+ video_chunk_w = chunk_w * spatial_ratio
1183
+ video_padding_h = padding_size * spatial_ratio
1184
+ video_padding_w = padding_size * spatial_ratio
1185
+
1186
+ # Calculate H dimension slice
1187
+ if cur_rank_h == 0:
1188
+ h_start = 0
1189
+ h_end = video_chunk_h + 2 * video_padding_h
1190
+ elif cur_rank_h == world_size_h - 1:
1191
+ h_start = video.shape[3] - (video_chunk_h + 2 * video_padding_h)
1192
+ h_end = video.shape[3]
1193
+ else:
1194
+ h_start = cur_rank_h * video_chunk_h - video_padding_h
1195
+ h_end = (cur_rank_h + 1) * video_chunk_h + video_padding_h
1196
+
1197
+ # Calculate W dimension slice
1198
+ if cur_rank_w == 0:
1199
+ w_start = 0
1200
+ w_end = video_chunk_w + 2 * video_padding_w
1201
+ elif cur_rank_w == world_size_w - 1:
1202
+ w_start = video.shape[4] - (video_chunk_w + 2 * video_padding_w)
1203
+ w_end = video.shape[4]
1204
+ else:
1205
+ w_start = cur_rank_w * video_chunk_w - video_padding_w
1206
+ w_end = (cur_rank_w + 1) * video_chunk_w + video_padding_w
1207
+
1208
+ # Extract the video chunk for this process
1209
+ video_chunk = video[:, :, :, h_start:h_end, w_start:w_end].contiguous()
1210
+
1211
+ # Encode the chunk
1212
+ if self.use_tiling:
1213
+ encoded_chunk = self.model.tiled_encode(video_chunk, self.scale)
1214
+ else:
1215
+ encoded_chunk = self.model.encode(video_chunk, self.scale)
1216
+
1217
+ # Remove padding from encoded chunk
1218
+ if cur_rank_h == 0:
1219
+ encoded_h_start = 0
1220
+ encoded_h_end = chunk_h
1221
+ elif cur_rank_h == world_size_h - 1:
1222
+ encoded_h_start = encoded_chunk.shape[3] - chunk_h
1223
+ encoded_h_end = encoded_chunk.shape[3]
1224
+ else:
1225
+ encoded_h_start = padding_size
1226
+ encoded_h_end = encoded_chunk.shape[3] - padding_size
1227
+
1228
+ if cur_rank_w == 0:
1229
+ encoded_w_start = 0
1230
+ encoded_w_end = chunk_w
1231
+ elif cur_rank_w == world_size_w - 1:
1232
+ encoded_w_start = encoded_chunk.shape[4] - chunk_w
1233
+ encoded_w_end = encoded_chunk.shape[4]
1234
+ else:
1235
+ encoded_w_start = padding_size
1236
+ encoded_w_end = encoded_chunk.shape[4] - padding_size
1237
+
1238
+ encoded_chunk = encoded_chunk[
1239
+ :, :, :, encoded_h_start:encoded_h_end, encoded_w_start:encoded_w_end
1240
+ ].contiguous()
1241
+
1242
+ # Gather all chunks
1243
+ total_processes = world_size_h * world_size_w
1244
+ full_encoded = [torch.empty_like(encoded_chunk) for _ in range(total_processes)]
1245
+
1246
+ dist.all_gather(full_encoded, encoded_chunk)
1247
+
1248
+ torch.cuda.synchronize()
1249
+
1250
+ # Reconstruct the full encoded tensor
1251
+ encoded_rows = []
1252
+ for h_idx in range(world_size_h):
1253
+ encoded_cols = []
1254
+ for w_idx in range(world_size_w):
1255
+ process_idx = h_idx * world_size_w + w_idx
1256
+ encoded_cols.append(full_encoded[process_idx])
1257
+ encoded_rows.append(torch.cat(encoded_cols, dim=4))
1258
+
1259
+ encoded = torch.cat(encoded_rows, dim=3)
1260
+
1261
+ return encoded.squeeze(0)
1262
+
1263
+ def encode(self, video, world_size_h=None, world_size_w=None):
1264
+ """
1265
+ video: one video with shape [1, C, T, H, W].
1266
+ """
1267
+ if self.parallel:
1268
+ world_size = dist.get_world_size()
1269
+ cur_rank = dist.get_rank()
1270
+ height, width = video.shape[3], video.shape[4]
1271
+
1272
+ if self.use_2d_split:
1273
+ if world_size_h is None or world_size_w is None:
1274
+ world_size_h, world_size_w = self._calculate_2d_grid(
1275
+ height // 8, width // 8, world_size
1276
+ )
1277
+ cur_rank_h = cur_rank // world_size_w
1278
+ cur_rank_w = cur_rank % world_size_w
1279
+ out = self.encode_dist_2d(
1280
+ video, world_size_h, world_size_w, cur_rank_h, cur_rank_w
1281
+ )
1282
+ else:
1283
+ # Original 1D splitting logic
1284
+ if width % world_size == 0:
1285
+ out = self.encode_dist(video, world_size, cur_rank, split_dim=4)
1286
+ elif height % world_size == 0:
1287
+ out = self.encode_dist(video, world_size, cur_rank, split_dim=3)
1288
+ else:
1289
+ logger.info("Fall back to naive encode mode")
1290
+ if self.use_tiling:
1291
+ out = self.model.tiled_encode(video, self.scale).squeeze(0)
1292
+ else:
1293
+ out = self.model.encode(video, self.scale).squeeze(0)
1294
+ else:
1295
+ if self.use_tiling:
1296
+ out = self.model.tiled_encode(video, self.scale).squeeze(0)
1297
+ else:
1298
+ out = self.model.encode(video, self.scale).squeeze(0)
1299
+
1300
+ return out
1301
+
1302
+ def decode_dist(self, zs, world_size, cur_rank, split_dim):
1303
+ splited_total_len = zs.shape[split_dim]
1304
+ splited_chunk_len = splited_total_len // world_size
1305
+ padding_size = 1
1306
+
1307
+ if cur_rank == 0:
1308
+ if split_dim == 2:
1309
+ zs = zs[:, :, : splited_chunk_len + 2 * padding_size, :].contiguous()
1310
+ elif split_dim == 3:
1311
+ zs = zs[:, :, :, : splited_chunk_len + 2 * padding_size].contiguous()
1312
+ elif cur_rank == world_size - 1:
1313
+ if split_dim == 2:
1314
+ zs = zs[:, :, -(splited_chunk_len + 2 * padding_size) :, :].contiguous()
1315
+ elif split_dim == 3:
1316
+ zs = zs[:, :, :, -(splited_chunk_len + 2 * padding_size) :].contiguous()
1317
+ else:
1318
+ if split_dim == 2:
1319
+ zs = zs[
1320
+ :,
1321
+ :,
1322
+ cur_rank * splited_chunk_len - padding_size : (cur_rank + 1)
1323
+ * splited_chunk_len
1324
+ + padding_size,
1325
+ :,
1326
+ ].contiguous()
1327
+ elif split_dim == 3:
1328
+ zs = zs[
1329
+ :,
1330
+ :,
1331
+ :,
1332
+ cur_rank * splited_chunk_len - padding_size : (cur_rank + 1)
1333
+ * splited_chunk_len
1334
+ + padding_size,
1335
+ ].contiguous()
1336
+
1337
+ decode_func = self.model.tiled_decode if self.use_tiling else self.model.decode
1338
+ images = decode_func(zs.unsqueeze(0), self.scale).clamp_(-1, 1)
1339
+
1340
+ if cur_rank == 0:
1341
+ if split_dim == 2:
1342
+ images = images[:, :, :, : splited_chunk_len * 8, :].contiguous()
1343
+ elif split_dim == 3:
1344
+ images = images[:, :, :, :, : splited_chunk_len * 8].contiguous()
1345
+ elif cur_rank == world_size - 1:
1346
+ if split_dim == 2:
1347
+ images = images[:, :, :, -splited_chunk_len * 8 :, :].contiguous()
1348
+ elif split_dim == 3:
1349
+ images = images[:, :, :, :, -splited_chunk_len * 8 :].contiguous()
1350
+ else:
1351
+ if split_dim == 2:
1352
+ images = images[
1353
+ :, :, :, 8 * padding_size : -8 * padding_size, :
1354
+ ].contiguous()
1355
+ elif split_dim == 3:
1356
+ images = images[
1357
+ :, :, :, :, 8 * padding_size : -8 * padding_size
1358
+ ].contiguous()
1359
+
1360
+ full_images = [torch.empty_like(images) for _ in range(world_size)]
1361
+ dist.all_gather(full_images, images)
1362
+
1363
+ torch.cuda.synchronize()
1364
+
1365
+ images = torch.cat(full_images, dim=split_dim + 1)
1366
+
1367
+ return images
1368
+
1369
+ def decode_dist_2d(self, zs, world_size_h, world_size_w, cur_rank_h, cur_rank_w):
1370
+ total_h = zs.shape[2]
1371
+ total_w = zs.shape[3]
1372
+
1373
+ chunk_h = total_h // world_size_h
1374
+ chunk_w = total_w // world_size_w
1375
+
1376
+ padding_size = 2
1377
+
1378
+ # Calculate H dimension slice
1379
+ if cur_rank_h == 0:
1380
+ h_start = 0
1381
+ h_end = chunk_h + 2 * padding_size
1382
+ elif cur_rank_h == world_size_h - 1:
1383
+ h_start = total_h - (chunk_h + 2 * padding_size)
1384
+ h_end = total_h
1385
+ else:
1386
+ h_start = cur_rank_h * chunk_h - padding_size
1387
+ h_end = (cur_rank_h + 1) * chunk_h + padding_size
1388
+
1389
+ # Calculate W dimension slice
1390
+         if cur_rank_w == 0:
+             w_start = 0
+             w_end = chunk_w + 2 * padding_size
+         elif cur_rank_w == world_size_w - 1:
+             w_start = total_w - (chunk_w + 2 * padding_size)
+             w_end = total_w
+         else:
+             w_start = cur_rank_w * chunk_w - padding_size
+             w_end = (cur_rank_w + 1) * chunk_w + padding_size
+
+         # Extract the latent chunk for this process
+         zs_chunk = zs[:, :, h_start:h_end, w_start:w_end].contiguous()
+
+         # Decode the chunk
+         decode_func = self.model.tiled_decode if self.use_tiling else self.model.decode
+         images_chunk = decode_func(zs_chunk.unsqueeze(0), self.scale).clamp_(-1, 1)
+
+         # Remove padding from the decoded chunk
+         spatial_ratio = 8
+         if cur_rank_h == 0:
+             decoded_h_start = 0
+             decoded_h_end = chunk_h * spatial_ratio
+         elif cur_rank_h == world_size_h - 1:
+             decoded_h_start = images_chunk.shape[3] - chunk_h * spatial_ratio
+             decoded_h_end = images_chunk.shape[3]
+         else:
+             decoded_h_start = padding_size * spatial_ratio
+             decoded_h_end = images_chunk.shape[3] - padding_size * spatial_ratio
+
+         if cur_rank_w == 0:
+             decoded_w_start = 0
+             decoded_w_end = chunk_w * spatial_ratio
+         elif cur_rank_w == world_size_w - 1:
+             decoded_w_start = images_chunk.shape[4] - chunk_w * spatial_ratio
+             decoded_w_end = images_chunk.shape[4]
+         else:
+             decoded_w_start = padding_size * spatial_ratio
+             decoded_w_end = images_chunk.shape[4] - padding_size * spatial_ratio
+
+         images_chunk = images_chunk[
+             :, :, :, decoded_h_start:decoded_h_end, decoded_w_start:decoded_w_end
+         ].contiguous()
+
+         # Gather all chunks
+         total_processes = world_size_h * world_size_w
+         full_images = [torch.empty_like(images_chunk) for _ in range(total_processes)]
+
+         dist.all_gather(full_images, images_chunk)
+
+         torch.cuda.synchronize()
+
+         # Reconstruct the full image tensor
+         image_rows = []
+         for h_idx in range(world_size_h):
+             image_cols = []
+             for w_idx in range(world_size_w):
+                 process_idx = h_idx * world_size_w + w_idx
+                 image_cols.append(full_images[process_idx])
+             image_rows.append(torch.cat(image_cols, dim=4))
+
+         images = torch.cat(image_rows, dim=3)
+
+         return images
+
+     def decode_dist_2d_stream(
+         self, zs, world_size_h, world_size_w, cur_rank_h, cur_rank_w
+     ):
+         total_h = zs.shape[2]
+         total_w = zs.shape[3]
+
+         chunk_h = total_h // world_size_h
+         chunk_w = total_w // world_size_w
+
+         padding_size = 2
+
+         # Calculate the H dimension slice
+         if cur_rank_h == 0:
+             h_start = 0
+             h_end = chunk_h + 2 * padding_size
+         elif cur_rank_h == world_size_h - 1:
+             h_start = total_h - (chunk_h + 2 * padding_size)
+             h_end = total_h
+         else:
+             h_start = cur_rank_h * chunk_h - padding_size
+             h_end = (cur_rank_h + 1) * chunk_h + padding_size
+
+         # Calculate the W dimension slice
+         if cur_rank_w == 0:
+             w_start = 0
+             w_end = chunk_w + 2 * padding_size
+         elif cur_rank_w == world_size_w - 1:
+             w_start = total_w - (chunk_w + 2 * padding_size)
+             w_end = total_w
+         else:
+             w_start = cur_rank_w * chunk_w - padding_size
+             w_end = (cur_rank_w + 1) * chunk_w + padding_size
+
+         # Extract the latent chunk for this process
+         zs_chunk = zs[:, :, h_start:h_end, w_start:w_end].contiguous()
+
+         for image in self.model.decode_stream(zs_chunk.unsqueeze(0), self.scale):
+             images_chunk = image.clamp_(-1, 1)
+             # Remove padding from the decoded chunk
+             spatial_ratio = 8
+             if cur_rank_h == 0:
+                 decoded_h_start = 0
+                 decoded_h_end = chunk_h * spatial_ratio
+             elif cur_rank_h == world_size_h - 1:
+                 decoded_h_start = images_chunk.shape[3] - chunk_h * spatial_ratio
+                 decoded_h_end = images_chunk.shape[3]
+             else:
+                 decoded_h_start = padding_size * spatial_ratio
+                 decoded_h_end = images_chunk.shape[3] - padding_size * spatial_ratio
+
+             if cur_rank_w == 0:
+                 decoded_w_start = 0
+                 decoded_w_end = chunk_w * spatial_ratio
+             elif cur_rank_w == world_size_w - 1:
+                 decoded_w_start = images_chunk.shape[4] - chunk_w * spatial_ratio
+                 decoded_w_end = images_chunk.shape[4]
+             else:
+                 decoded_w_start = padding_size * spatial_ratio
+                 decoded_w_end = images_chunk.shape[4] - padding_size * spatial_ratio
+
+             images_chunk = images_chunk[
+                 :, :, :, decoded_h_start:decoded_h_end, decoded_w_start:decoded_w_end
+             ].contiguous()
+
+             # Gather all chunks
+             total_processes = world_size_h * world_size_w
+             full_images = [
+                 torch.empty_like(images_chunk) for _ in range(total_processes)
+             ]
+
+             dist.all_gather(full_images, images_chunk)
+
+             torch.cuda.synchronize()
+
+             # Reconstruct the full image tensor
+             image_rows = []
+             for h_idx in range(world_size_h):
+                 image_cols = []
+                 for w_idx in range(world_size_w):
+                     process_idx = h_idx * world_size_w + w_idx
+                     image_cols.append(full_images[process_idx])
+                 image_rows.append(torch.cat(image_cols, dim=4))
+
+             images = torch.cat(image_rows, dim=3)
+
+             yield images
+
+     def decode(self, zs):
+         if self.parallel:
+             world_size = dist.get_world_size()
+             cur_rank = dist.get_rank()
+             latent_height, latent_width = zs.shape[2], zs.shape[3]
+
+             if self.use_2d_split:
+                 world_size_h, world_size_w = self._calculate_2d_grid(
+                     latent_height, latent_width, world_size
+                 )
+                 cur_rank_h = cur_rank // world_size_w
+                 cur_rank_w = cur_rank % world_size_w
+                 images = self.decode_dist_2d(
+                     zs, world_size_h, world_size_w, cur_rank_h, cur_rank_w
+                 )
+             else:
+                 # Original 1D splitting logic
+                 if latent_width % world_size == 0:
+                     images = self.decode_dist(zs, world_size, cur_rank, split_dim=3)
+                 elif latent_height % world_size == 0:
+                     images = self.decode_dist(zs, world_size, cur_rank, split_dim=2)
+                 else:
+                     logger.info("Fall back to naive decode mode")
+                     images = self.model.decode(zs.unsqueeze(0), self.scale).clamp_(
+                         -1, 1
+                     )
+         else:
+             decode_func = (
+                 self.model.tiled_decode if self.use_tiling else self.model.decode
+             )
+             images = decode_func(zs.unsqueeze(0), self.scale).clamp_(-1, 1)
+
+         return images
+
+     def decode_stream(self, zs):
+         if self.parallel:
+             world_size = dist.get_world_size()
+             cur_rank = dist.get_rank()
+             latent_height, latent_width = zs.shape[2], zs.shape[3]
+
+             world_size_h, world_size_w = self._calculate_2d_grid(
+                 latent_height, latent_width, world_size
+             )
+             cur_rank_h = cur_rank // world_size_w
+             cur_rank_w = cur_rank % world_size_w
+             for images in self.decode_dist_2d_stream(
+                 zs, world_size_h, world_size_w, cur_rank_h, cur_rank_w
+             ):
+                 yield images
+         else:
+             for image in self.model.decode_stream(zs.unsqueeze(0), self.scale):
+                 yield image.clamp_(-1, 1)
+
+     def encode_video(self, vid):
+         return self.model.encode_video(vid)
+
+     def decode_video(self, vid_enc):
+         return self.model.decode_video(vid_enc)
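The two distributed decode paths above tile the latent spatially across ranks with a small halo (padding_size latents per side), let each rank decode its own tile, crop the halo back off in pixel space (spatial_ratio = 8 output pixels per latent), and reassemble the tiles in row-major rank order after an all_gather. Below is a minimal single-process sketch of just that slicing arithmetic; the helper names tile_range and crop_range are illustrative and are not part of the repository.

    # Sketch of the halo-padded tiling used by decode_dist_2d, one axis at a time.
    def tile_range(rank, world_size, total, chunk, pad):
        """Latent-space [start, end) decoded by one rank along a single axis."""
        if rank == 0:
            return 0, chunk + 2 * pad
        if rank == world_size - 1:
            return total - (chunk + 2 * pad), total
        return rank * chunk - pad, (rank + 1) * chunk + pad

    def crop_range(rank, world_size, decoded_len, chunk, pad, ratio=8):
        """Pixel-space [start, end) that removes the decoded halo again."""
        if rank == 0:
            return 0, chunk * ratio
        if rank == world_size - 1:
            return decoded_len - chunk * ratio, decoded_len
        return pad * ratio, decoded_len - pad * ratio

    if __name__ == "__main__":
        total_h, world_size_h, pad = 32, 4, 2      # e.g. 32 latent rows over 4 ranks
        chunk_h = total_h // world_size_h
        for r in range(world_size_h):
            print(r, tile_range(r, world_size_h, total_h, chunk_h, pad))
        # 0 (0, 12), 1 (6, 18), 2 (14, 26), 3 (20, 32): tiles overlap by the halo,
        # and after crop_range each output row belongs to exactly one rank.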
generate_video.py ADDED
@@ -0,0 +1,218 @@
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+ import argparse
+ import os
+ import numpy as np
+ import time
+ import torch
+ import torch.distributed as dist
+ import subprocess
+ import imageio
+ import librosa
+ from loguru import logger
+ from collections import deque
+ from datetime import datetime
+
+ from flash_head.inference import get_pipeline, get_base_data, get_infer_params, get_audio_embedding, run_pipeline
+
+ def _validate_args(args):
+     # Basic checks
+     assert args.ckpt_dir is not None, "Please specify the FlashHead model checkpoint directory."
+     assert args.wav2vec_dir is not None, "Please specify the wav2vec checkpoint directory."
+     assert args.model_type in ("pro", "lite"), "Please specify the model type (pro or lite)."
+     assert args.cond_image_dir is not None or args.cond_image is not None, "Please specify the condition image or directory."
+     assert args.audio_path is not None, "Please specify the audio path."
+
+     args.base_seed = args.base_seed if args.base_seed >= 0 else 42
+
+ def _parse_args():
+     parser = argparse.ArgumentParser(
+         description="Generate video from one image using FlashHead"
+     )
+     parser.add_argument(
+         "--ckpt_dir",
+         type=str,
+         default=None,
+         help="The path to the FlashHead model checkpoint directory.")
+     parser.add_argument(
+         "--wav2vec_dir",
+         type=str,
+         default=None,
+         help="The path to the wav2vec checkpoint directory.")
+     parser.add_argument(
+         "--model_type",
+         type=str,
+         default=None,
+         help="Choose from pro or lite.")
+     parser.add_argument(
+         "--save_file",
+         type=str,
+         default=None,
+         help="The file to save the generated video to.")
+     parser.add_argument(
+         "--base_seed",
+         type=int,
+         default=42,
+         help="The seed to use for generating the video.")
+     parser.add_argument(
+         "--cond_image",
+         type=str,
+         default=None,
+         help="[meta file] The condition image path to generate the video.")
+     parser.add_argument(
+         "--cond_image_dir",
+         type=str,
+         default=None,
+         help="[meta directory] The directory of condition images.")
+     parser.add_argument(
+         "--audio_path",
+         type=str,
+         default=None,
+         help="[meta file] The audio path to generate the video.")
+     parser.add_argument(
+         "--audio_encode_mode",
+         type=str,
+         default="stream",
+         choices=['stream', 'once'],
+         help="stream: encode the audio chunk before every generation; once: encode the whole audio up front.")
+     parser.add_argument(
+         "--use_face_crop",
+         type=bool,
+         default=False,
+         help="Enable face detection and cropping for the condition image.")
+     args = parser.parse_args()
+
+     _validate_args(args)
+
+     return args
+
+ def save_video(frames_list, video_path, audio_path, fps):
+     temp_video_path = video_path.replace('.mp4', '_tmp.mp4')
+     with imageio.get_writer(temp_video_path, format='mp4', mode='I',
+                             fps=fps, codec='h264', ffmpeg_params=['-bf', '0']) as writer:
+         for frames in frames_list:
+             frames = frames.numpy().astype(np.uint8)
+             for i in range(frames.shape[0]):
+                 frame = frames[i, :, :, :]
+                 writer.append_data(frame)
+
+     # merge video and audio
+     cmd = ['ffmpeg', '-i', temp_video_path, '-i', audio_path, '-c:v', 'copy', '-c:a', 'aac', '-shortest', video_path, '-y']
+     subprocess.run(cmd)
+     os.remove(temp_video_path)
+
+
+ def generate(args):
+     world_size = int(os.environ.get("WORLD_SIZE", 1))
+     rank = int(os.environ.get("RANK", 0))
+
+     pipeline = get_pipeline(world_size=world_size, ckpt_dir=args.ckpt_dir, wav2vec_dir=args.wav2vec_dir, model_type=args.model_type)
+     get_base_data(pipeline, cond_image_path_or_dir=args.cond_image_dir if args.cond_image_dir is not None else args.cond_image, base_seed=args.base_seed, use_face_crop=args.use_face_crop)
+     infer_params = get_infer_params()
+
+     sample_rate = infer_params['sample_rate']
+     tgt_fps = infer_params['tgt_fps']
+     cached_audio_duration = infer_params['cached_audio_duration']
+     frame_num = infer_params['frame_num']
+     motion_frames_num = infer_params['motion_frames_num']
+     slice_len = frame_num - motion_frames_num
+
+     human_speech_array_all, _ = librosa.load(args.audio_path, sr=sample_rate, mono=True)
+     human_speech_array_slice_len = slice_len * sample_rate // tgt_fps
+     human_speech_array_frame_num = frame_num * sample_rate // tgt_fps
+
+     if rank == 0:
+         logger.info("Data preparation done. Start to generate video...")
+
+     generated_list = []
+     if args.audio_encode_mode == 'once':
+         # pad the audio with silence to avoid truncating the last chunk
+         remainder = (len(human_speech_array_all) - human_speech_array_frame_num) % human_speech_array_slice_len
+         if remainder > 0:
+             pad_length = human_speech_array_slice_len - remainder
+             human_speech_array_all = np.concatenate([human_speech_array_all, np.zeros(pad_length, dtype=human_speech_array_all.dtype)])
+
+         # encode the whole audio at once
+         audio_embedding_all = get_audio_embedding(pipeline, human_speech_array_all)
+
+         # split the audio embedding into chunks
+         # for the Pro model: 33, 28, 28, 28, ...; for the Lite model: 33, 24, 24, 24, ...
+         audio_embedding_chunks_list = [audio_embedding_all[:, i * slice_len: i * slice_len + frame_num].contiguous() for i in range((audio_embedding_all.shape[1] - frame_num) // slice_len)]
+
+         for chunk_idx, audio_embedding_chunk in enumerate(audio_embedding_chunks_list):
+             torch.cuda.synchronize()
+             start_time = time.time()
+
+             # inference
+             video = run_pipeline(pipeline, audio_embedding_chunk)
+
+             if chunk_idx != 0:
+                 video = video[motion_frames_num:]
+
+             torch.cuda.synchronize()
+             end_time = time.time()
+             if rank == 0:
+                 logger.info(f"Generate video chunk-{chunk_idx} done, cost time: {(end_time - start_time):.3f}s")
+
+             generated_list.append(video.cpu())
+
+     elif args.audio_encode_mode == 'stream':
+         cached_audio_length_sum = sample_rate * cached_audio_duration
+         audio_end_idx = cached_audio_duration * tgt_fps
+         audio_start_idx = audio_end_idx - frame_num
+
+         audio_dq = deque([0.0] * cached_audio_length_sum, maxlen=cached_audio_length_sum)
+
+         # pad the audio with silence to avoid truncating the last chunk
+         remainder = len(human_speech_array_all) % human_speech_array_slice_len
+         if remainder > 0:
+             pad_length = human_speech_array_slice_len - remainder
+             human_speech_array_all = np.concatenate([human_speech_array_all, np.zeros(pad_length, dtype=human_speech_array_all.dtype)])
+
+         # split the audio into chunks
+         # for the Pro model: 28, 28, 28, 28, ...; for the Lite model: 24, 24, 24, 24, ...
+         human_speech_array_slices = human_speech_array_all.reshape(-1, human_speech_array_slice_len)
+
+         for chunk_idx, human_speech_array in enumerate(human_speech_array_slices):
+             torch.cuda.synchronize()
+             start_time = time.time()
+
+             # streaming-encode the current audio chunk
+             audio_dq.extend(human_speech_array.tolist())
+             audio_array = np.array(audio_dq)
+             audio_embedding = get_audio_embedding(pipeline, audio_array, audio_start_idx, audio_end_idx)
+
+             # inference
+             video = run_pipeline(pipeline, audio_embedding)
+             video = video[motion_frames_num:]
+
+             torch.cuda.synchronize()
+             end_time = time.time()
+             if rank == 0:
+                 logger.info(f"Generate video chunk-{chunk_idx} done, cost time: {(end_time - start_time):.3f}s")
+
+             generated_list.append(video.cpu())
+
+     if rank == 0:
+         if args.save_file is None:
+             output_dir = 'sample_results'
+             if not os.path.exists(output_dir):
+                 os.makedirs(output_dir)
+             timestamp = datetime.now().strftime("%Y%m%d-%H:%M:%S-%f")[:-3]
+             filename = f"res_{timestamp}.mp4"
+             filepath = os.path.join(output_dir, filename)
+             args.save_file = filepath
+
+         save_video(generated_list, args.save_file, args.audio_path, fps=tgt_fps)
+         logger.info(f"Saved generated video to {args.save_file}")
+         logger.info("Finished.")
+
+     if world_size > 1:
+         dist.barrier()
+         dist.destroy_process_group()
+
+ if __name__ == "__main__":
+     args = _parse_args()
+     generate(args)
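In the default 'stream' mode above, the script keeps a rolling buffer of the most recent cached_audio_duration seconds of audio in a fixed-length deque, appends each new slice of slice_len frames' worth of samples, re-encodes the whole window, and feeds the model the embeddings between audio_start_idx and audio_end_idx. A minimal sketch of that sliding-window bookkeeping follows; the numeric values are illustrative stand-ins for the real ones read from flash_head/configs/infer_params.yaml, and push_chunk is a hypothetical helper.

    # Sketch of the sliding-window audio buffering used by the 'stream' mode.
    import numpy as np
    from collections import deque

    sample_rate = 16000          # audio sampling rate (Hz), assumed
    tgt_fps = 25                 # video frame rate, assumed
    frame_num = 33               # frames generated per chunk (motion + new), assumed
    motion_frames_num = 5        # frames reused from the previous chunk, assumed
    cached_audio_duration = 4    # seconds of audio kept in the rolling buffer, assumed

    slice_len = frame_num - motion_frames_num            # new frames per chunk
    slice_samples = slice_len * sample_rate // tgt_fps   # audio samples appended per chunk
    buffer_len = sample_rate * cached_audio_duration     # rolling buffer size in samples
    audio_end_idx = cached_audio_duration * tgt_fps      # frame index at the buffer end
    audio_start_idx = audio_end_idx - frame_num          # first frame fed to the model

    audio_dq = deque([0.0] * buffer_len, maxlen=buffer_len)   # starts as silence

    def push_chunk(new_samples: np.ndarray) -> np.ndarray:
        """Append one chunk of audio and return the full window to re-encode."""
        audio_dq.extend(new_samples.tolist())
        return np.array(audio_dq)

    # each push_chunk() call would then be followed by
    # get_audio_embedding(pipeline, window, audio_start_idx, audio_end_idx)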
gradio_app_streaming.py ADDED
@@ -0,0 +1,339 @@
+ """
+ Gradio streaming video generation: generation and saving run asynchronously so that playback stays real-time.
+ """
+ import gradio as gr
+ import os
+ import torch
+ import numpy as np
+ import time
+ import wave
+ import imageio
+ import librosa
+ import subprocess
+ import queue
+ import threading
+ from datetime import datetime
+ from collections import deque
+ from loguru import logger
+
+ from flash_head.inference import (
+     get_pipeline,
+     get_base_data,
+     get_infer_params,
+     get_audio_embedding,
+     run_pipeline,
+ )
+
+ # gr.Video with streaming=True needs segments longer than 1 s; in practice close to 3 s
+ # is required for smooth playback, so every 3 chunks are merged into one video segment.
+ CHUNKS_PER_SEGMENT = 3
+
+ pipeline = None
+ loaded_ckpt_dir = None
+ loaded_wav2vec_dir = None
+ loaded_model_type = None
+
+
+ def _write_frames_to_mp4(frames_list, video_path, fps):
+     """Write a list of frame tensors to an MP4 file (video track only)."""
+     os.makedirs(os.path.dirname(video_path) or ".", exist_ok=True)
+     with imageio.get_writer(
+         video_path,
+         format="mp4",
+         mode="I",
+         fps=fps,
+         codec="h264",
+         ffmpeg_params=["-bf", "0"],
+     ) as writer:
+         for frames in frames_list:
+             frames_np = frames.numpy().astype(np.uint8)
+             for i in range(frames_np.shape[0]):
+                 writer.append_data(frames_np[i, :, :, :])
+     return video_path
+
+
+ def save_video_with_audio(frames_list, video_path, audio_path, fps):
+     """Write the full video, then mux in the audio track with ffmpeg."""
+     temp_path = video_path.replace(".mp4", "_temp.mp4")
+     _write_frames_to_mp4(frames_list, temp_path, fps)
+     try:
+         cmd = [
+             "ffmpeg", "-y",
+             "-i", temp_path,
+             "-i", audio_path,
+             "-c:v", "copy",
+             "-c:a", "aac",
+             # "-shortest",
+             video_path,
+         ]
+         subprocess.run(cmd, check=True, capture_output=True)
+     finally:
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+     return video_path
+
+ def _save_chunk_audio_to_wav(audio_array, wav_path, sample_rate=16000):
+     """Save a float32 [-1, 1] audio array as a 16-bit mono WAV file."""
+     os.makedirs(os.path.dirname(wav_path) or ".", exist_ok=True)
+     samples = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
+     with wave.open(wav_path, "wb") as wav_file:
+         wav_file.setnchannels(1)
+         wav_file.setsampwidth(2)
+         wav_file.setframerate(sample_rate)
+         wav_file.writeframes(samples.tobytes())
+     return wav_path
+
+ def run_inference_streaming(
+     ckpt_dir,
+     wav2vec_dir,
+     model_type,
+     cond_image,
+     audio_path,
+     seed,
+     use_face_crop,
+     progress=gr.Progress(),
+ ):
+     """
+     Streaming inference: the main thread watches res_queue and saves/yields segments as frames arrive;
+     inference runs in a separate thread, processing chunks in order and pushing results into res_queue.
+     """
+     global pipeline, loaded_ckpt_dir, loaded_wav2vec_dir, loaded_model_type
+
+     if (
+         pipeline is None
+         or loaded_ckpt_dir != ckpt_dir
+         or loaded_wav2vec_dir != wav2vec_dir
+         or loaded_model_type != model_type
+     ):
+         progress(0.2, desc="Loading Model...")
+         logger.info(f"Loading pipeline with ckpt_dir={ckpt_dir}, wav2vec_dir={wav2vec_dir}")
+         try:
+             pipeline = get_pipeline(
+                 world_size=1,
+                 ckpt_dir=ckpt_dir,
+                 model_type=model_type,
+                 wav2vec_dir=wav2vec_dir,
+             )
+             loaded_ckpt_dir = ckpt_dir
+             loaded_wav2vec_dir = wav2vec_dir
+             loaded_model_type = model_type
+         except Exception as e:
+             logger.error(f"Failed to load model: {e}")
+             raise gr.Error(f"Failed to load model: {e}")
+
+     progress(0.5, desc="Preparing Data...")
+     base_seed = int(seed) if seed >= 0 else 9999
+     try:
+         get_base_data(
+             pipeline,
+             cond_image_path_or_dir=cond_image,
+             base_seed=base_seed,
+             use_face_crop=use_face_crop,
+         )
+     except Exception as e:
+         logger.error(f"Error in get_base_data: {e}")
+         raise gr.Error(f"Error processing inputs: {e}")
+
+     infer_params = get_infer_params()
+     sample_rate = infer_params["sample_rate"]
+     tgt_fps = infer_params["tgt_fps"]
+     cached_audio_duration = infer_params["cached_audio_duration"]
+     frame_num = infer_params["frame_num"]
+     motion_frames_num = infer_params["motion_frames_num"]
+     slice_len = frame_num - motion_frames_num
+
+     try:
+         human_speech_array_all, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
+     except Exception as e:
+         raise gr.Error(f"Failed to load audio file: {e}")
+
+     human_speech_array_slice_len = slice_len * sample_rate // tgt_fps
+
+     stream_dir = os.path.join("gradio_results", "stream_preview")
+     os.makedirs(stream_dir, exist_ok=True)
+     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S-%f")[:-3]
+     accumulated = []
+
+     # Stream mode is used by default: prepare the audio chunk slices
+     cached_audio_length_sum = sample_rate * cached_audio_duration
+     audio_end_idx = cached_audio_duration * tgt_fps
+     audio_start_idx = audio_end_idx - frame_num
+     remainder = len(human_speech_array_all) % human_speech_array_slice_len
+     if remainder > 0:
+         pad_length = human_speech_array_slice_len - remainder
+         human_speech_array_all = np.concatenate(
+             [human_speech_array_all, np.zeros(pad_length, dtype=human_speech_array_all.dtype)]
+         )
+     human_speech_array_slices = human_speech_array_all.reshape(-1, human_speech_array_slice_len)
+     total_chunks = len(human_speech_array_slices)
+     if total_chunks == 0:
+         raise gr.Error("Audio too short: no chunks to generate. Please use a longer audio.")
+
+     # Data preparation: merge every CHUNKS_PER_SEGMENT chunks into one WAV file
+     # (named with the timestamp and segment_id)
+     segment_audio_paths = {}
+     num_segments = (total_chunks + CHUNKS_PER_SEGMENT - 1) // CHUNKS_PER_SEGMENT
+     for segment_id in range(num_segments):
+         start = segment_id * CHUNKS_PER_SEGMENT
+         end = min(start + CHUNKS_PER_SEGMENT, total_chunks)
+         audio_concat = np.concatenate(
+             [human_speech_array_slices[i] for i in range(start, end)]
+         )
+         segment_audio_name = f"audio_{timestamp}_seg_{segment_id:04d}.wav"
+         segment_audio_path = os.path.join(stream_dir, segment_audio_name)
+         _save_chunk_audio_to_wav(
+             audio_concat,
+             segment_audio_path,
+             sample_rate=sample_rate,
+         )
+         segment_audio_paths[segment_id] = segment_audio_path
+     logger.info(
+         f"Pre-saved {num_segments} segment audios (every {CHUNKS_PER_SEGMENT} chunks) under {stream_dir}"
+     )
+
+     # Result queue: the inference thread pushes (chunk_idx, chunk_frames_np);
+     # the main thread picks the matching segment audio and muxes it in.
+     res_queue = queue.Queue()
+
+     def inference_worker():
+         """Worker thread: run inference chunk by chunk, push each result into res_queue, then move on to the next chunk."""
+         audio_dq = deque([0.0] * cached_audio_length_sum, maxlen=cached_audio_length_sum)
+         for chunk_idx, human_speech_array in enumerate(human_speech_array_slices):
+             audio_dq.extend(human_speech_array.tolist())
+             audio_array = np.array(audio_dq)
+             audio_embedding = get_audio_embedding(pipeline, audio_array, audio_start_idx, audio_end_idx)
+             torch.cuda.synchronize()
+             start_time = time.time()
+             video = run_pipeline(pipeline, audio_embedding)
+             video = video[motion_frames_num:]
+             torch.cuda.synchronize()
+             logger.info(f"Infer chunk-{chunk_idx} done, cost time: {time.time() - start_time:.2f}s")
+             chunk_frames_np = video.cpu().numpy()
+             res_queue.put((chunk_idx, chunk_frames_np))
+         res_queue.put(None)  # end-of-stream sentinel
+
+     worker_thread = threading.Thread(target=inference_worker)
+     worker_thread.start()
+     logger.info("Inference worker thread started. Main will consume res_queue and yield video paths.")
+
+     # Main loop: watch res_queue; once CHUNKS_PER_SEGMENT chunks are collected,
+     # merge them into one MP4 (with the matching segment audio) and yield its path.
+     frame_buffer = []
+     while True:
+         item = res_queue.get()
+         if item is None:
+             break
+         chunk_idx, chunk_frames_np = item
+         chunk_frames = torch.from_numpy(chunk_frames_np)
+         accumulated.append(chunk_frames)
+         frame_buffer.append(chunk_frames)
+         if len(frame_buffer) == CHUNKS_PER_SEGMENT:
+             segment_id = (chunk_idx + 1 - CHUNKS_PER_SEGMENT) // CHUNKS_PER_SEGMENT
+             segment_audio_path = segment_audio_paths[segment_id]
+             segment_path = os.path.join(
+                 stream_dir, f"preview_{timestamp}_seg_{segment_id:04d}.mp4"
+             )
+             save_video_with_audio(
+                 frame_buffer,
+                 segment_path,
+                 segment_audio_path,
+                 fps=tgt_fps,
+             )
+             logger.info(
+                 f"Saved segment-{segment_id} (chunks {segment_id * CHUNKS_PER_SEGMENT}-{chunk_idx}) and yielding to frontend."
+             )
+             yield os.path.abspath(segment_path)
+             frame_buffer = []
+
+     # Merge any remaining chunks (fewer than CHUNKS_PER_SEGMENT) into a final segment
+     if frame_buffer:
+         segment_id = num_segments - 1
+         segment_audio_path = segment_audio_paths[segment_id]
+         segment_path = os.path.join(
+             stream_dir, f"preview_{timestamp}_seg_{segment_id:04d}.mp4"
+         )
+         save_video_with_audio(
+             frame_buffer,
+             segment_path,
+             segment_audio_path,
+             fps=tgt_fps,
+         )
+         logger.info(
+             f"Saved final segment-{segment_id} ({len(frame_buffer)} chunks) and yielding to frontend."
+         )
+         yield os.path.abspath(segment_path)
+
+     worker_thread.join()
+
+     if not accumulated:
+         raise gr.Error("No video frames generated. Please check inputs and try again.")
+
+     output_dir = "gradio_results"
+     os.makedirs(output_dir, exist_ok=True)
+     final_filename = f"res_{timestamp}.mp4"
+     final_path = os.path.join(output_dir, final_filename)
+     save_video_with_audio(accumulated, final_path, audio_path, fps=tgt_fps)
+     logger.info(f"Saved to {final_path}")
+
+
+ # ---------- Gradio UI ----------
+ with gr.Blocks(title="SoulX-FlashHead Streaming Video Generation", theme=gr.themes.Soft()) as app:
+     gr.Markdown("# ⚡ SoulX-FlashHead Streaming Video Generation")
+     gr.Markdown("Upload an image and an audio clip; the video plays back while it is being generated, with audio and video kept in sync. Currently only single-GPU inference is supported.")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             with gr.Group():
+                 gr.Markdown("### 🎬 Generation Inputs")
+                 with gr.Row():
+                     cond_image_input = gr.Image(
+                         label="Condition Image",
+                         type="filepath",
+                         value="examples/girl.png",
+                         height=300,
+                     )
+                     audio_path_input = gr.Audio(
+                         label="Audio Input",
+                         type="filepath",
+                         value="examples/podcast_sichuan_16k.wav",
+                     )
+                 generate_btn = gr.Button("🚀 Generate Video (Streaming)", variant="primary", size="lg")
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 ckpt_dir_input = gr.Textbox(
+                     label="FlashHead Checkpoint Directory",
+                     value="models/SoulX-FlashHead-1_3B",
+                 )
+                 wav2vec_dir_input = gr.Textbox(
+                     label="Wav2Vec Directory",
+                     value="models/wav2vec2-base-960h",
+                 )
+                 model_type_input = gr.Dropdown(
+                     label="Model Type",
+                     choices=["pro", "lite"],
+                     value="lite",
+                 )
+                 use_face_crop_input = gr.Checkbox(label="Use Face Crop", value=False)
+                 seed_input = gr.Number(label="Random Seed", value=9999, precision=0)
+         with gr.Column(scale=1):
+             gr.Markdown("### 📺 Output Video (streamed)")
+             video_output = gr.Video(
+                 label="Generated Video",
+                 height=512,
+                 format="mp4",
+                 streaming=True,
+                 autoplay=True,
+             )
+
+     generate_btn.click(
+         fn=run_inference_streaming,
+         inputs=[
+             ckpt_dir_input,
+             wav2vec_dir_input,
+             model_type_input,
+             cond_image_input,
+             audio_path_input,
+             seed_input,
+             use_face_crop_input,
+         ],
+         outputs=video_output,
+     )
+
+ if __name__ == "__main__":
+     app.launch()
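The app above keeps playback smooth by decoupling generation from delivery: a worker thread pushes each generated chunk into a Queue, while the Gradio callback (a generator) groups every CHUNKS_PER_SEGMENT chunks into a playable MP4 segment and yields its path to the streaming gr.Video component. A stripped-down sketch of that producer/consumer shape follows; generate_chunk is a stand-in for the real run_pipeline call, and segment writing is reduced to yielding the buffered items.

    # Minimal producer/consumer sketch of the streaming pattern used by run_inference_streaming.
    import queue
    import threading
    import time

    CHUNKS_PER_SEGMENT = 3
    TOTAL_CHUNKS = 7

    def generate_chunk(idx):
        time.sleep(0.1)          # pretend to run the model
        return f"frames-{idx}"

    def stream_segments():
        res_queue = queue.Queue()

        def worker():
            for idx in range(TOTAL_CHUNKS):
                res_queue.put((idx, generate_chunk(idx)))
            res_queue.put(None)  # end-of-stream sentinel

        threading.Thread(target=worker, daemon=True).start()

        buffer = []
        while (item := res_queue.get()) is not None:
            buffer.append(item)
            if len(buffer) == CHUNKS_PER_SEGMENT:
                yield buffer     # in the app: write an MP4 segment and yield its path
                buffer = []
        if buffer:
            yield buffer         # final, possibly shorter segment

    for segment in stream_segments():
        print([idx for idx, _ in segment])   # [0, 1, 2], [3, 4, 5], [6]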
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ torch==2.7.1
+ opencv-python>=4.12.0.88
+ opencv-python-headless>=4.12.0.88
+ diffusers>=0.34.0
+ transformers==4.57.3
+ tokenizers>=0.20.3
+ accelerate>=1.8.1
+ tqdm
+ imageio
+ easydict
+ ftfy
+ imageio-ffmpeg
+ scikit-image
+ loguru
+ gradio==5.50.0
+ xfuser>=0.4.3
+ pyloudnorm
+ decord
+ xformers==0.0.31
+ librosa
+ mediapipe==0.10.9
+ flask
+ huggingface_hub