XiangpengYang committed
Commit 42a2bfa · 1 Parent(s): 64a0f40

first commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +4 -0
  3. LICENSE +201 -0
  4. README.md +209 -14
  5. __init__.py +3 -0
  6. app.py +391 -0
  7. assets/dough.mp4 +3 -0
  8. assets/sign.mp4 +3 -0
  9. assets/teaser_test.json +20 -0
  10. assets/two_man.mp4 +3 -0
  11. assets/woman_ballon.mp4 +3 -0
  12. config/1.3b_lora_zero_stage2_config.json +24 -0
  13. config/14b_lora_zero2_bf16_config.json +24 -0
  14. config/wan2.1/wan_civitai.yaml +39 -0
  15. config/wan2.2/wan_civitai_5b.yaml +41 -0
  16. config/wan2.2/wan_civitai_i2v.yaml +43 -0
  17. config/wan2.2/wan_civitai_s2v.yaml +44 -0
  18. config/wan2.2/wan_civitai_t2v.yaml +43 -0
  19. config/zero_stage2_config.json +16 -0
  20. config/zero_stage3_config.json +27 -0
  21. config/zero_stage3_config_cpu_offload.json +28 -0
  22. inference.py +400 -0
  23. install.py +45 -0
  24. pyproject.toml +15 -0
  25. requirements.txt +26 -0
  26. scripts/local_style.sh +13 -0
  27. scripts/obj_add.sh +13 -0
  28. scripts/obj_rem.sh +13 -0
  29. scripts/parallel_infer.sh +12 -0
  30. videox_fun/__init__.py +0 -0
  31. videox_fun/api/api.py +226 -0
  32. videox_fun/api/api_multi_nodes.py +320 -0
  33. videox_fun/data/bucket_sampler.py +392 -0
  34. videox_fun/data/dataset_image.py +76 -0
  35. videox_fun/data/dataset_image_video.py +1939 -0
  36. videox_fun/data/dataset_video.py +262 -0
  37. videox_fun/dist/__init__.py +66 -0
  38. videox_fun/dist/cogvideox_xfuser.py +105 -0
  39. videox_fun/dist/flux_xfuser.py +168 -0
  40. videox_fun/dist/fsdp.py +44 -0
  41. videox_fun/dist/fuser.py +55 -0
  42. videox_fun/dist/qwen_xfuser.py +176 -0
  43. videox_fun/dist/wan_xfuser.py +180 -0
  44. videox_fun/pipeline/__init__.py +21 -0
  45. videox_fun/pipeline/pipeline_wan.py +799 -0
  46. videox_fun/pipeline/pipeline_wan2_2.py +591 -0
  47. videox_fun/ui/cogvideox_fun_ui.py +722 -0
  48. videox_fun/ui/controller.py +514 -0
  49. videox_fun/ui/ui.py +366 -0
  50. videox_fun/ui/wan2_2_fun_ui.py +803 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ samples/
+ models/
+ __pycache__/
+ *.pyc
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,14 +1,209 @@
- ---
- title: VideoCoF
- emoji: 📉
- colorFrom: gray
- colorTo: green
- sdk: gradio
- sdk_version: 6.1.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Unified Video Editing with Temporal Reasoner
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+
+ <h1 style="margin: 0; font-size: 2.4em;">
+ Unified Video Editing with Temporal Reasoner
+ </h1>
+
+ <h4 style="margin: 15px 0; color: #2c3e50;">
+ 👁️ See &rarr; 🧠 Reason &rarr; ✏️ Edit
+ </h4>
+
+ <h4 style="margin: 15px 0; color: #2c3e50;">
+ 🚀 A Chain-of-Frames video editing method enabling temporal reasoning and 4x video length extrapolation with just 50k training pairs!
+ </h4>
+
+ [![Hugging Face Daily Paper](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Daily%20Paper-yellow)](https://huggingface.co/papers/2512.07469)
+ [![arXiv](https://img.shields.io/badge/arXiv-2512.07469-b31b1b.svg)](https://arxiv.org/abs/2512.07469)
+ [![Project Page](https://img.shields.io/badge/Project-Page-green)](https://videocof.github.io)
+ [![Hugging Face Model](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-yellow)](https://huggingface.co/XiangpengYang/VideoCoF)
+ ![visitors](https://visitor-badge.laobi.icu/badge?page_id=videocof.VideoCoF&left_color=green&right_color=red)
+
+ </div>
+
+ <div align="center">
+ <b>
+ <a href="https://scholar.google.com/citations?user=reiIeYMAAAAJ">Xiangpeng Yang</a><sup>1</sup>,
+ <a href="https://horizonwind2004.github.io/">Ji Xie</a><sup>2</sup>,
+ <a href="https://scholar.google.com/citations?user=OvfI_HMAAAAJ">Yiyuan Yang</a><sup>1</sup>,
+ <a href="https://scholar.google.com/citations?user=zfeWd6gAAAAJ">Yan Huang</a><sup>1</sup>,
+ <a href="https://scholar.google.com/citations?user=sCuACdkAAAAJ">Min Xu</a><sup>1</sup>,
+ <a href="https://scholar.google.com/citations?user=sCuACdkAAAAJ">Qiang Wu</a><sup>1</sup>
+ </b>
+ <br>
+ <span style="font-size: 1em; color: #555;"><sup>1</sup>University of Technology Sydney, <sup>2</sup>Zhejiang University</span>
+ </div>
+
+ <br>
+
+ ## 💿 Introduction
+
+ https://github.com/user-attachments/assets/26f7d347-3d6c-43cf-9645-6eb5906f6ad6
+
+ ## 🔥 News
+
+ - **2025.12.09**: Paper available on arXiv.
+ - **2025.12.08**: Released the inference code and the videocof-50k weights.
+ - **2025.12.06**: 🔥 Project Page and README updated!
+
+
+ ## 📑 Table of Contents
+
+ - [🔧 Quick Start](#-quick-start)
+ - [🏆 Model Zoo](#-model-zoo)
+ - [🍭 Results](#-results)
+ - [🎨 Edit Comparison](#-edit-comparison)
+ - [🚧 TODO](#-todo)
+ - [🙏 Acknowledgments](#-acknowledgments)
+ - [📜 License](#-license)
+ - [📮 Contact](#-contact)
+ - [📄 Citation](#-citation)
+
+ ## 🔧 Quick Start
+
+ 1. **Clone the repository:**
+
+ ```bash
+ git clone https://github.com/videocof/VideoCoF.git
+ cd VideoCoF
+ ```
+
+ 2. **Install dependencies:**
+
+ ```bash
+ # 1. Create and activate a conda environment
+ conda create -n videocof python=3.10
+ conda activate videocof
+
+ # 2. Install PyTorch (choose the version compatible with your CUDA)
+ # For standard GPUs (CUDA 12.1):
+ pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+
+ # For Hopper GPUs (e.g., H100/H800) requiring fast inference:
+ # pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128
+
+ # 3. Install other dependencies
+ pip install -r requirements.txt
+ ```
+
+ **Note on Flash Attention:**
+ We recommend **FlashAttention-3** (currently in beta) for optimal performance, especially on NVIDIA H100/H800 GPUs.
+ If you are using these GPUs, please follow the [official FlashAttention-3 installation guide](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#flashattention-3-beta-release) after installing the compatible PyTorch version (e.g., PyTorch 2.8 + CUDA 12.8).
+
+
+ 3. **Download Models:**
+
+ **Wan-2.1-T2V-14B Pretrained Weights:**
+
+ ```bash
+ git lfs install
+ git clone https://huggingface.co/Wan-AI/Wan2.1-T2V-14B
+
+ # Or using huggingface-cli:
+ # hf download Wan-AI/Wan2.1-T2V-14B --local-dir Wan2.1-T2V-14B
+ ```
+
+ **VideoCoF Checkpoint:**
+
+ ```bash
+ git lfs install
+ git clone https://huggingface.co/XiangpengYang/VideoCoF videocof_weight
+
+ # Or using huggingface-cli:
+ # hf download XiangpengYang/VideoCoF --local-dir videocof_weight
+ ```
+
+ 4. **Inference:**
+
+ For single inference tasks:
+
+ ```bash
+ # Object Removal
+ sh scripts/obj_rem.sh
+
+ # Object Addition
+ sh scripts/obj_add.sh
+
+ # Local Style Transfer
+ sh scripts/local_style.sh
+ ```
+
+ For parallel inference:
+
+ ```bash
+ sh scripts/parallel_infer.sh
+ ```
+
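Editor's note: the scripts above are thin wrappers around `inference.py` in this commit. The sketch below summarizes the pipeline call they ultimately issue; argument names and defaults are taken from `app.py` and `inference.py` later in this diff, while model loading, LoRA merging, and video decoding are omitted, so treat it as an illustration rather than a drop-in entry point.

```python
import torch

# `pipeline` is the WanPipeline that inference.py assembles from
# config/wan2.1/wan_civitai.yaml, the Wan2.1-T2V-14B weights, and the
# VideoCoF LoRA (merged with lora_weight=1.0); its construction is omitted here.
# `source_video` is the (B, C, T, H, W) tensor produced by load_video_frames():
# source frames sampled from the input clip and scaled to [-1, 1].
# `prompt` is the chain-of-frames prompt; see the sketch in the Results section.
sample = pipeline(
    video=source_video,
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_frames=65,             # default --num_frames in inference.py
    source_frames=33,          # default --source_frames
    reasoning_frames=4,        # default --reasoning_frames
    repeat_rope=True,          # --repeat_rope: repeat temporal RoPE for source/target segments
    cot=True,                  # predict reasoning (grounding) frames before the edited frames
    height=h, width=w,         # taken from the input video resolution
    guidance_scale=5.0,
    num_inference_steps=50,
    generator=torch.Generator("cuda").manual_seed(0),
).videos
```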
+ ## 🏆 Model Zoo
+
+ Our models are available on Hugging Face:
+
+ | Model Name | Description | Link |
+ |------------|-------------|------|
+ | VideoCoF-Base | Base model trained on 50k video pairs | [Hugging Face](https://huggingface.co/XiangpengYang/VideoCoF) |
+
+ ## 🍭 Results
+
+ ### Why Do We Need Reasoning Before Editing?
+ ![](assets/motivation_v2.gif)
+
+ Current video editing methods typically follow two paths:
+ 1. **Expert models**: Rely on external masks for precision but sacrifice unification.
+ 2. **Unified in-context learning models**: Mask-free but often struggle with spatial accuracy due to the lack of explicit cues.
+
+ **VideoCoF** bridges this gap by predicting reasoning tokens before generating the target video tokens.
+
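Editor's note: as a concrete illustration of the reasoning step, the snippet below reconstructs how the chain-of-frames prompt is assembled in `app.py` and `inference.py` in this commit; the helper `derive_ground_object_from_instruction` lives in `videox_fun/data/dataset_image_video.py`, and its exact output format is an implementation detail of that file.

```python
from videox_fun.data.dataset_image_video import derive_ground_object_from_instruction

edit_text = "Remove the young man with short black hair wearing black shirt on the left."

# The model first "sees" the source clip, then emits grounding (reasoning)
# frames for the referenced object, and only then generates the edited clip.
ground_instr = derive_ground_object_from_instruction(edit_text)
prompt = (
    "A video sequence showing three parts: first the original scene, "
    f"then grounded {ground_instr}, and finally the same scene but {edit_text}"
)
```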
+ ### Key Capabilities
+
+ 1. **Seeing, Reasoning, Editing**: VideoCoF adopts a "seeing, reasoning, editing" approach, ensuring edits are applied accurately to the intended targets.
+ 2. **Length Extrapolation**: Trained on only **50k** pairs (33 frames each), VideoCoF demonstrates robust multi-shot editing and length generalization (e.g., 4&times; length extrapolation).
+ 3. **Diverse Editing Tasks**: Supports fine-grained (instance- and part-level, spatially aware) Object Removal, Object Addition, Object Swap, and Local Style Transfer.
+
+ ### Gallery Highlights
+
+ > Please refer to our [Project Page](https://videocof.github.io) for the full gallery.
+
+ * **Object Removal**: Remove people or objects based on text prompts.
+ * **Object Addition**: Add elements like animals, objects, or people.
+ * **Object Swap**: Change specific attributes or objects.
+ * **Local Style Transfer**: Modify textures, materials, or colors.
+
+ ## 🚧 TODO
+
+ - [x] Release paper.
+ - [x] Release inference code and weights.
+ - [ ] Release training code.
+ - [ ] Release training data.
+ - [ ] Add Hugging Face demo.
+
+ ## 🙏 Acknowledgments
+
+ We thank the authors of related works and the open-source communities behind [VideoX-Fun](https://github.com/aigc-apps/VideoX-Fun) and [Wan](https://github.com/Wan-Video/Wan2.1) for their contributions.
+
+ ## 📜 License
+
+ This project is licensed under the [Apache License 2.0](LICENSE).
+
+ ## 📮 Contact
+
+ For any questions, please feel free to reach out to the author Xiangpeng Yang [@knightyxp](https://github.com/knightyxp), email: knightyxp@gmail.com or Xiangpeng.Yang@student.uts.edu.au.
+
+ ## 📄 Citation
+
+ If you find this work useful for your research, please consider citing:
+
+ ```bibtex
+ @article{yang2025videocof,
+   title={Unified Video Editing with Temporal Reasoner},
+   author={Yang, Xiangpeng and Xie, Ji and Yang, Yiyuan and Huang, Yan and Xu, Min and Wu, Qiang},
+   journal={arXiv preprint arXiv:2512.07469},
+   year={2025}
+ }
+ ```
+
+ <div align="center">
+ ⭐ **If you find this project helpful, please consider giving it a star!** ⭐
+ </div>
+
+ ## ⭐️ Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=knightyxp/VideoCoF&type=Date&legend=top-left)](https://star-history.com/#knightyxp/VideoCoF&Date)
__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .comfyui.comfyui_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+
+ __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
app.py ADDED
@@ -0,0 +1,391 @@
+ import os
+ import sys
+ import time
+ import torch
+ import gradio as gr
+ import numpy as np
+ import imageio
+ from PIL import Image
+
+ # Add project root to path (not needed when launched from the repo root)
+ # current_file_path = os.path.abspath(__file__)
+ # project_root = os.path.dirname(os.path.dirname(current_file_path))
+ # if project_root not in sys.path:
+ #     sys.path.insert(0, project_root)
+
+ from videox_fun.ui.wan_ui import Wan_Controller, css
+ from videox_fun.ui.ui import (
+     create_model_type, create_model_checkpoints, create_finetune_models_checkpoints,
+     create_teacache_params, create_cfg_skip_params, create_cfg_riflex_k,
+     create_prompts, create_samplers, create_height_width,
+     create_generation_methods_and_video_length, create_generation_method,
+     create_cfg_and_seedbox, create_ui_outputs
+ )
+ from videox_fun.data.dataset_image_video import derive_ground_object_from_instruction
+ from videox_fun.utils.lora_utils import merge_lora, unmerge_lora
+ from videox_fun.utils.utils import save_videos_grid, timer
+
+ # Custom replacement for create_height_width: VideoCoF always infers at the
+ # input video's resolution, so the resize method and sliders are created but hidden.
+ def create_height_width_english(default_height, default_width, maximum_height, maximum_width):
+     resize_method = gr.Radio(
+         ["Generate by", "Resize according to Reference"],
+         value="Generate by",
+         show_label=False,
+         visible=False  # hidden: the input video's resolution is always used
+     )
+     # The sliders only act as fallbacks when no video is provided, which should
+     # not happen in the VideoCoF workflow, so they are hidden as well.
+     width_slider = gr.Slider(label="Width", value=default_width, minimum=128, maximum=maximum_width, step=16, visible=False)
+     height_slider = gr.Slider(label="Height", value=default_height, minimum=128, maximum=maximum_height, step=16, visible=False)
+     base_resolution = gr.Radio(label="Base Resolution", value=512, choices=[512, 640, 768, 896, 960, 1024], visible=False)
+
+     return resize_method, width_slider, height_slider, base_resolution
+
+ def load_video_frames(video_path: str, source_frames: int):
+     assert source_frames is not None, "source_frames is required"
+
+     reader = imageio.get_reader(video_path)
+     try:
+         total_frames = reader.count_frames()
+     except Exception:
+         total_frames = sum(1 for _ in reader)
+         reader = imageio.get_reader(video_path)
+
+     stride = max(1, total_frames // source_frames)
+     # Use a random start frame, as in inference.py
+     start_frame = torch.randint(0, max(1, total_frames - stride * source_frames), (1,))[0].item()
+
+     frames = []
+     original_height, original_width = None, None
+
+     for i in range(source_frames):
+         idx = start_frame + i * stride
+         if idx >= total_frames:
+             break
+         try:
+             frame = reader.get_data(idx)
+             pil_frame = Image.fromarray(frame)
+             if original_height is None:
+                 original_width, original_height = pil_frame.size
+             frames.append(pil_frame)
+         except IndexError:
+             break
+
+     reader.close()
+
+     while len(frames) < source_frames:
+         if frames:
+             frames.append(frames[-1].copy())
+         else:
+             w, h = (original_width, original_height) if original_width else (832, 480)
+             frames.append(Image.new('RGB', (w, h), (0, 0, 0)))
+
+     input_video = torch.from_numpy(np.array(frames))
+     input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0).float()
+     input_video = input_video * (2.0 / 255.0) - 1.0
+
+     return input_video, original_height, original_width
+
+ class VideoCoF_Controller(Wan_Controller):
+     @timer
+     def generate(
+         self,
+         diffusion_transformer_dropdown,
+         base_model_dropdown,
+         lora_model_dropdown,
+         lora_alpha_slider,
+         prompt_textbox,
+         negative_prompt_textbox,
+         sampler_dropdown,
+         sample_step_slider,
+         resize_method,
+         width_slider,
+         height_slider,
+         base_resolution,
+         generation_method,
+         length_slider,
+         overlap_video_length,
+         partial_video_length,
+         cfg_scale_slider,
+         start_image,
+         end_image,
+         validation_video,
+         validation_video_mask,
+         control_video,
+         denoise_strength,
+         seed_textbox,
+         ref_image=None,
+         enable_teacache=None,
+         teacache_threshold=None,
+         num_skip_start_steps=None,
+         teacache_offload=None,
+         cfg_skip_ratio=None,
+         enable_riflex=None,
+         riflex_k=None,
+         # Custom args
+         source_frames_slider=33,
+         reasoning_frames_slider=4,
+         repeat_rope_checkbox=True,
+         fps=10,
+         is_api=False,
+     ):
+         self.clear_cache()
+         print("VideoCoF generation started.")
+
+         if self.diffusion_transformer_dropdown != diffusion_transformer_dropdown:
+             self.update_diffusion_transformer(diffusion_transformer_dropdown)
+
+         if self.base_model_path != base_model_dropdown:
+             self.update_base_model(base_model_dropdown)
+
+         if self.lora_model_path != lora_model_dropdown:
+             self.update_lora_model(lora_model_dropdown)
+
+         # Scheduler setup
+         scheduler_config = self.pipeline.scheduler.config
+         if sampler_dropdown in ["Flow_Unipc", "Flow_DPM++"]:
+             scheduler_config['shift'] = 1
+         self.pipeline.scheduler = self.scheduler_dict[sampler_dropdown].from_config(scheduler_config)
+
+         # LoRA merging
+         if self.lora_model_path != "none":
+             print("Merge LoRA.")
+             self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+
+         # Seed
+         if int(seed_textbox) != -1 and seed_textbox != "":
+             torch.manual_seed(int(seed_textbox))
+         else:
+             seed_textbox = np.random.randint(0, 1e10)
+         generator = torch.Generator(device=self.device).manual_seed(int(seed_textbox))
+
+         try:
+             # VideoCoF logic: use validation_video as the source if provided
+             # (the standard UI component for video-to-video), falling back to control_video.
+             input_video_path = validation_video
+
+             if input_video_path is None:
+                 input_video_path = control_video
+
+             if input_video_path is None:
+                 raise ValueError("Please upload a video for VideoCoF generation.")
+
+             # CoT prompt construction
+             edit_text = prompt_textbox
+             ground_instr = derive_ground_object_from_instruction(edit_text)
+             prompt = (
+                 "A video sequence showing three parts: first the original scene, "
+                 f"then grounded {ground_instr}, and finally the same scene but {edit_text}"
+             )
+             print(f"Constructed prompt: {prompt}")
+
+             # Load video frames
+             input_video_tensor, video_height, video_width = load_video_frames(
+                 input_video_path,
+                 source_frames=source_frames_slider
+             )
+
+             # Use the loaded video's dimensions
+             h, w = video_height, video_width
+             print(f"Input video dimensions: {w}x{h}")
+
+             print(f"Running pipeline with frames={length_slider}, source={source_frames_slider}, reasoning={reasoning_frames_slider}")
+
+             sample = self.pipeline(
+                 video=input_video_tensor,
+                 prompt=prompt,
+                 num_frames=length_slider,
+                 source_frames=source_frames_slider,
+                 reasoning_frames=reasoning_frames_slider,
+                 negative_prompt=negative_prompt_textbox,
+                 height=h,
+                 width=w,
+                 generator=generator,
+                 guidance_scale=cfg_scale_slider,
+                 num_inference_steps=sample_step_slider,
+                 repeat_rope=repeat_rope_checkbox,
+                 cot=True,
+             ).videos
+
+             final_video = sample
+
+         except Exception as e:
+             print(f"Error: {e}")
+             if self.lora_model_path != "none":
+                 self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+             return gr.update(), gr.update(), f"Error: {str(e)}"
+
+         # Unmerge LoRA
+         if self.lora_model_path != "none":
+             self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+
+         # Save output
+         save_sample_path = self.save_outputs(
+             False, length_slider, final_video, fps=fps
+         )
+
+         # The handler returns (result_image, result_video, infer_progress); the
+         # uploaded source video stays in its input component, so only the edited
+         # result is shown in the output area.
+         return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
+
+ def ui(GPU_memory_mode, scheduler_dict, config_path, compile_dit, weight_dtype):
+     controller = VideoCoF_Controller(
+         GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
+         config_path=config_path, compile_dit=compile_dit,
+         weight_dtype=weight_dtype
+     )
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# VideoCoF Demo")
+
+         with gr.Column(variant="panel"):
+             # Hide model selection
+             diffusion_transformer_dropdown, _ = create_model_checkpoints(controller, visible=False, default_model="Wan-AI/Wan2.1-T2V-14B")
+             base_model_dropdown, lora_model_dropdown, lora_alpha_slider, _ = create_finetune_models_checkpoints(controller, visible=False, default_lora="XiangpengYang/VideoCoF")
+
+             # Set default LoRA alpha to 1.0 (matching inference.py)
+             lora_alpha_slider.value = 1.0
+
+             with gr.Row():
+                 # Disable TeaCache by default
+                 enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload = create_teacache_params(False, 0.10, 5, False)
+                 cfg_skip_ratio = create_cfg_skip_params(0)
+                 enable_riflex, riflex_k = create_cfg_riflex_k(False, 6)
+
+         with gr.Column(variant="panel"):
+             prompt_textbox, negative_prompt_textbox = create_prompts(prompt="Remove the young man with short black hair wearing black shirt on the left.")
+
+             with gr.Row():
+                 with gr.Column():
+                     sampler_dropdown, sample_step_slider = create_samplers(controller)
+
+                     # Custom VideoCoF parameters
+                     with gr.Group():
+                         gr.Markdown("### VideoCoF Parameters")
+                         source_frames_slider = gr.Slider(label="Source Frames", minimum=1, maximum=100, value=33, step=1)
+                         reasoning_frames_slider = gr.Slider(label="Reasoning Frames", minimum=1, maximum=20, value=4, step=1)
+                         repeat_rope_checkbox = gr.Checkbox(label="Repeat RoPE", value=True)
+
+                     # Use the custom height/width controls (hidden; the input resolution is used)
+                     resize_method, width_slider, height_slider, base_resolution = create_height_width_english(
+                         default_height=480, default_width=832, maximum_height=1344, maximum_width=1344
+                     )
+
+                     # Default video length 65
+                     generation_method, length_slider, overlap_video_length, partial_video_length = \
+                         create_generation_methods_and_video_length(
+                             ["Video Generation"],
+                             default_video_length=65,
+                             maximum_video_length=161
+                         )
+
+                     # Simplified input for VideoCoF: mainly video-to-video.
+                     image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
+                         ["Video to Video"], prompt_textbox, support_end_image=False, default_video="assets/two_man.mp4",
+                         video_examples=[
+                             ["assets/two_man.mp4", "Remove the young man with short black hair wearing black shirt on the left."],
+                             ["assets/sign.mp4", "Replace the yellow \"SCHOOL\" sign with a red hospital sign, featuring a white hospital emblem on the top and the word \"HOSPITAL\" below."]
+                         ]
+                     )
+
+                     # Ensure validation_video is visible and interactive
+                     validation_video.visible = True
+                     validation_video.interactive = True
+
+                     # Set default seed to 0
+                     cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(True)
+                     seed_textbox.value = "0"
+
+                     generate_button = gr.Button(value="Generate", variant='primary')
+
+                 result_image, result_video, infer_progress = create_ui_outputs()
+
+         # Event handlers
+         generate_button.click(
+             fn=controller.generate,
+             inputs=[
+                 diffusion_transformer_dropdown,
+                 base_model_dropdown,
+                 lora_model_dropdown,
+                 lora_alpha_slider,
+                 prompt_textbox,
+                 negative_prompt_textbox,
+                 sampler_dropdown,
+                 sample_step_slider,
+                 resize_method,
+                 width_slider,
+                 height_slider,
+                 base_resolution,
+                 generation_method,
+                 length_slider,
+                 overlap_video_length,
+                 partial_video_length,
+                 cfg_scale_slider,
+                 start_image,
+                 end_image,
+                 validation_video,
+                 validation_video_mask,
+                 control_video,
+                 denoise_strength,
+                 seed_textbox,
+                 ref_image,
+                 enable_teacache,
+                 teacache_threshold,
+                 num_skip_start_steps,
+                 teacache_offload,
+                 cfg_skip_ratio,
+                 enable_riflex,
+                 riflex_k,
+                 # New inputs
+                 source_frames_slider,
+                 reasoning_frames_slider,
+                 repeat_rope_checkbox
+             ],
+             outputs=[result_image, result_video, infer_progress]
+         )
+
+     return demo, controller
+
+ if __name__ == "__main__":
+     from videox_fun.ui.controller import flow_scheduler_dict
+
+     GPU_memory_mode = "sequential_cpu_offload"
+     compile_dit = False
+     weight_dtype = torch.bfloat16
+     server_name = "0.0.0.0"
+     server_port = 7860
+     config_path = "config/wan2.1/wan_civitai.yaml"
+
+     demo, controller = ui(GPU_memory_mode, flow_scheduler_dict, config_path, compile_dit, weight_dtype)
+
+     demo.queue(status_update_rate=1).launch(
+         server_name=server_name,
+         server_port=server_port,
+         prevent_thread_lock=True,
+         share=False
+     )
+
+     while True:
+         time.sleep(5)
assets/dough.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5262cf58ffa08dcd79d6346abec46bc0234aebfc65905b5ea2ca4ab905ca9dac
+ size 185700
assets/sign.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e94f03a7d5738a001ce2e1302a8ae65596431a647dbfed83cdb6876322175a7
+ size 100798
assets/teaser_test.json ADDED
@@ -0,0 +1,20 @@
+ [
+     {
+         "task_type": "obj_add",
+         "sample_id": "001",
+         "source_video_path": "assets/woman_ballon.mp4",
+         "qwen_vl_72b_refined_instruction": "Add the woman in a floral dress pointing at the balloon on the left."
+     },
+     {
+         "task_type": "obj_rem",
+         "sample_id": "001",
+         "source_video_path": "assets/two_man.mp4",
+         "qwen_vl_72b_refined_instruction": "Remove the young man with short black hair wearing black shirt on the left."
+     },
+     {
+         "task_type": "local_style",
+         "sample_id": "001",
+         "source_video_path": "assets/sign.mp4",
+         "qwen_vl_72b_refined_instruction": "Replace the yellow \"SCHOOL\" sign with a red hospital sign, featuring a white hospital emblem on the top and the word \"HOSPITAL\" below."
+     }
+ ]
assets/two_man.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd9c0f6523207bbcf0d5159beb7f7eaf37811e6e5b7a53585dda50491a573cd9
+ size 303233
assets/woman_ballon.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:575b37abda414161179bc00e0e7b6893f28feb967e875c8f9676275d2cc32572
+ size 89085
config/1.3b_lora_zero_stage2_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "train_micro_batch_size_per_gpu": 4,
+     "train_batch_size": 64,
+     "gradient_accumulation_steps": 1,
+     "gradient_clipping": 0.05,
+     "zero_optimization": {
+         "stage": 2,
+         "offload_optimizer": {
+             "device": "none"
+         },
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": 5e8,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "reduce_scatter": true
+     },
+     "steps_per_print": 100,
+     "wall_clock_breakdown": false
+ }
config/14b_lora_zero2_bf16_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": 1,
+     "gradient_clipping": 0.05,
+     "zero_optimization": {
+         "stage": 2,
+         "offload_optimizer": {
+             "device": "none"
+         },
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": 5e8,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "reduce_scatter": true
+     },
+     "steps_per_print": 100,
+     "wall_clock_breakdown": false
+ }
config/wan2.1/wan_civitai.yaml ADDED
@@ -0,0 +1,39 @@
+ format: civitai
+ pipeline: Wan
+ transformer_additional_kwargs:
+   transformer_subpath: ./
+   dict_mapping:
+     in_dim: in_channels
+     dim: hidden_size
+
+ vae_kwargs:
+   vae_subpath: Wan2.1_VAE.pth
+   temporal_compression_ratio: 4
+   spatial_compression_ratio: 8
+
+ text_encoder_kwargs:
+   text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
+   tokenizer_subpath: google/umt5-xxl
+   text_length: 512
+   vocab: 256384
+   dim: 4096
+   dim_attn: 4096
+   dim_ffn: 10240
+   num_heads: 64
+   num_layers: 24
+   num_buckets: 32
+   shared_pos: False
+   dropout: 0.0
+
+ scheduler_kwargs:
+   scheduler_subpath: null
+   num_train_timesteps: 1000
+   shift: 5.0
+   use_dynamic_shifting: false
+   base_shift: 0.5
+   max_shift: 1.15
+   base_image_seq_len: 256
+   max_image_seq_len: 4096
+
+ image_encoder_kwargs:
+   image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ format: civitai
2
+ pipeline: Wan
3
+ transformer_additional_kwargs:
4
+ transformer_low_noise_model_subpath: ./
5
+ transformer_combination_type: "single"
6
+ dict_mapping:
7
+ in_dim: in_channels
8
+ dim: hidden_size
9
+
10
+ vae_kwargs:
11
+ vae_type: "AutoencoderKLWan3_8"
12
+ vae_subpath: Wan2.2_VAE.pth
13
+ temporal_compression_ratio: 4
14
+ spatial_compression_ratio: 16
15
+
16
+ text_encoder_kwargs:
17
+ text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
18
+ tokenizer_subpath: google/umt5-xxl
19
+ text_length: 512
20
+ vocab: 256384
21
+ dim: 4096
22
+ dim_attn: 4096
23
+ dim_ffn: 10240
24
+ num_heads: 64
25
+ num_layers: 24
26
+ num_buckets: 32
27
+ shared_pos: False
28
+ dropout: 0.0
29
+
30
+ scheduler_kwargs:
31
+ scheduler_subpath: null
32
+ num_train_timesteps: 1000
33
+ shift: 5.0
34
+ use_dynamic_shifting: false
35
+ base_shift: 0.5
36
+ max_shift: 1.15
37
+ base_image_seq_len: 256
38
+ max_image_seq_len: 4096
39
+
40
+ image_encoder_kwargs:
41
+ image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
config/wan2.2/wan_civitai_i2v.yaml ADDED
@@ -0,0 +1,43 @@
+ format: civitai
+ pipeline: Wan
+ transformer_additional_kwargs:
+   transformer_low_noise_model_subpath: ./low_noise_model
+   transformer_high_noise_model_subpath: ./high_noise_model
+   transformer_combination_type: "moe"
+   boundary: 0.900
+   dict_mapping:
+     in_dim: in_channels
+     dim: hidden_size
+
+ vae_kwargs:
+   vae_type: "AutoencoderKLWan"
+   vae_subpath: Wan2.1_VAE.pth
+   temporal_compression_ratio: 4
+   spatial_compression_ratio: 8
+
+ text_encoder_kwargs:
+   text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
+   tokenizer_subpath: google/umt5-xxl
+   text_length: 512
+   vocab: 256384
+   dim: 4096
+   dim_attn: 4096
+   dim_ffn: 10240
+   num_heads: 64
+   num_layers: 24
+   num_buckets: 32
+   shared_pos: False
+   dropout: 0.0
+
+ scheduler_kwargs:
+   scheduler_subpath: null
+   num_train_timesteps: 1000
+   shift: 5.0
+   use_dynamic_shifting: false
+   base_shift: 0.5
+   max_shift: 1.15
+   base_image_seq_len: 256
+   max_image_seq_len: 4096
+
+ image_encoder_kwargs:
+   image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
config/wan2.2/wan_civitai_s2v.yaml ADDED
@@ -0,0 +1,44 @@
+ format: civitai
+ pipeline: Wan
+ transformer_additional_kwargs:
+   transformer_low_noise_model_subpath: ./
+   transformer_combination_type: "single"
+   dict_mapping:
+     in_dim: in_channels
+     dim: hidden_size
+
+ vae_kwargs:
+   vae_type: "AutoencoderKLWan"
+   vae_subpath: Wan2.1_VAE.pth
+   temporal_compression_ratio: 4
+   spatial_compression_ratio: 8
+
+ text_encoder_kwargs:
+   text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
+   tokenizer_subpath: google/umt5-xxl
+   text_length: 512
+   vocab: 256384
+   dim: 4096
+   dim_attn: 4096
+   dim_ffn: 10240
+   num_heads: 64
+   num_layers: 24
+   num_buckets: 32
+   shared_pos: False
+   dropout: 0.0
+
+ audio_encoder_kwargs:
+   audio_encoder_subpath: wav2vec2-large-xlsr-53-english
+
+ scheduler_kwargs:
+   scheduler_subpath: null
+   num_train_timesteps: 1000
+   shift: 3.0
+   use_dynamic_shifting: false
+   base_shift: 0.5
+   max_shift: 1.15
+   base_image_seq_len: 256
+   max_image_seq_len: 4096
+
+ image_encoder_kwargs:
+   image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
config/wan2.2/wan_civitai_t2v.yaml ADDED
@@ -0,0 +1,43 @@
+ format: civitai
+ pipeline: Wan
+ transformer_additional_kwargs:
+   transformer_low_noise_model_subpath: ./low_noise_model
+   transformer_high_noise_model_subpath: ./high_noise_model
+   transformer_combination_type: "moe"
+   boundary: 0.875
+   dict_mapping:
+     in_dim: in_channels
+     dim: hidden_size
+
+ vae_kwargs:
+   vae_type: "AutoencoderKLWan"
+   vae_subpath: Wan2.1_VAE.pth
+   temporal_compression_ratio: 4
+   spatial_compression_ratio: 8
+
+ text_encoder_kwargs:
+   text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
+   tokenizer_subpath: google/umt5-xxl
+   text_length: 512
+   vocab: 256384
+   dim: 4096
+   dim_attn: 4096
+   dim_ffn: 10240
+   num_heads: 64
+   num_layers: 24
+   num_buckets: 32
+   shared_pos: False
+   dropout: 0.0
+
+ scheduler_kwargs:
+   scheduler_subpath: null
+   num_train_timesteps: 1000
+   shift: 12.0
+   use_dynamic_shifting: false
+   base_shift: 0.5
+   max_shift: 1.15
+   base_image_seq_len: 256
+   max_image_seq_len: 4096
+
+ image_encoder_kwargs:
+   image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
config/zero_stage2_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "dump_state": true,
+     "zero_optimization": {
+         "stage": 2,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": 5e8
+     }
+ }
config/zero_stage3_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false,
+     "zero_optimization": {
+         "stage": 3,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "reduce_bucket_size": 5e8,
+         "sub_group_size": 1e9,
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": "auto",
+         "offload_optimizer": {
+             "device": "none"
+         },
+         "offload_param": {
+             "device": "none"
+         }
+     }
+ }
config/zero_stage3_config_cpu_offload.json ADDED
@@ -0,0 +1,28 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false,
+     "zero_optimization": {
+         "stage": 3,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "reduce_bucket_size": 5e8,
+         "sub_group_size": 1e9,
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": "auto",
+         "offload_optimizer": {
+             "device": "cpu"
+         },
+         "offload_param": {
+             "device": "cpu"
+         }
+     }
+ }
+
inference.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.distributed as dist
9
+ from diffusers import FlowMatchEulerDiscreteScheduler
10
+ from omegaconf import OmegaConf
11
+ from PIL import Image
12
+ import imageio
13
+
14
+ current_file_path = os.path.abspath(__file__)
15
+ project_roots = [os.path.dirname(current_file_path), os.path.dirname(os.path.dirname(current_file_path)), os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))]
16
+ for project_root in project_roots:
17
+ sys.path.insert(0, project_root) if project_root not in sys.path else None
18
+
19
+ from videox_fun.models import (AutoencoderKLWan, WanT5EncoderModel, AutoTokenizer,
20
+ WanTransformer3DModel)
21
+ from videox_fun.pipeline import WanPipeline
22
+ from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, replace_parameters_by_name,
23
+ convert_weight_dtype_wrapper)
24
+ from videox_fun.utils.lora_utils import merge_lora, unmerge_lora
25
+ from videox_fun.utils.utils import (filter_kwargs, save_videos_grid)
26
+ from videox_fun.data.dataset_image_video import derive_ground_object_from_instruction
27
+ from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler
28
+ from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
29
+
30
+
31
+ def load_video_frames(
32
+ video_path: str,
33
+ source_frames: int = None,
34
+ ):
35
+ assert source_frames is not None, "请传入 source_frames"
36
+
37
+ reader = imageio.get_reader(video_path)
38
+ try:
39
+ total_frames = reader.count_frames()
40
+ except Exception:
41
+ total_frames = sum(1 for _ in reader)
42
+ reader = imageio.get_reader(video_path)
43
+
44
+ stride = max(1, total_frames // source_frames)
45
+ start_frame = torch.randint(0, max(1, total_frames - stride * source_frames), (1,))[0].item()
46
+
47
+ frames = []
48
+ original_height, original_width = None, None
49
+
50
+ for i in range(source_frames):
51
+ idx = start_frame + i * stride
52
+ if idx >= total_frames:
53
+ break
54
+ try:
55
+ frame = reader.get_data(idx)
56
+ pil_frame = Image.fromarray(frame)
57
+ if original_height is None:
58
+ original_width, original_height = pil_frame.size
59
+ print(f"Original video dimensions: {original_width}x{original_height}")
60
+ frames.append(pil_frame)
61
+ except IndexError:
62
+ break
63
+
64
+ reader.close()
65
+
66
+ while len(frames) < source_frames:
67
+ if frames:
68
+ frames.append(frames[-1].copy())
69
+ else:
70
+ w, h = (original_width, original_height) if original_width else (832, 480)
71
+ frames.append(Image.new('RGB', (w, h), (0, 0, 0)))
72
+
73
+ assert len(frames) == source_frames
74
+ print(f"Loaded {source_frames} source frames")
75
+
76
+ input_video = torch.from_numpy(np.array(frames))
77
+ input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0).float()
78
+ input_video = input_video * (2.0 / 255.0) - 1.0
79
+
80
+ return input_video, original_height, original_width
81
+
82
+
83
+ def parse_args():
84
+ parser = argparse.ArgumentParser(description="Video-to-video CoT reasoning generation from JSON task list with parallel inference")
85
+ parser.add_argument("--test_json", type=str, default=None, help="Path to test JSON file for batch inference")
86
+ parser.add_argument("--prompt", type=str, default=None, help="Text prompt for editing (single mode)")
87
+ parser.add_argument("--video_path", type=str, default=None, help="Path to input video (single mode)")
88
+ parser.add_argument("--model_name", type=str, default="/scratch3/yan204/models/Wan2.1-T2V-14B", help="Model checkpoint path")
89
+ parser.add_argument("--output_dir", type=str, required=True, help="Output directory for generated videos")
90
+ parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducible generation")
91
+ parser.add_argument("--videocof_path", type=str, default=None, help="Path to videocof weight checkpoint")
92
+ parser.add_argument("--num_frames", type=int, default=65, help="Total number of frames (input + generated)")
93
+ parser.add_argument("--source_frames", type=int, default=33, help="Number of source frames; default 33")
94
+ parser.add_argument("--reasoning_frames", type=int, default=4, help="Grounding frames in the middle segment (pixel-space)")
95
+ parser.add_argument("--repeat_rope", action="store_true", help="Enable repeat temporal RoPE for src and tgt segments")
96
+ return parser.parse_args()
97
+
98
+
99
+ # Defaults aligned with predict_v2v_json_new.py
100
+ GPU_memory_mode = "sequential_cpu_offload"
101
+ ulysses_degree = 1
102
+ ring_degree = 1
103
+ fsdp_dit = False
104
+ fsdp_text_encoder = True
105
+ compile_dit = False
106
+ enable_teacache = True
107
+ teacache_threshold = 0.10
108
+ num_skip_start_steps = 5
109
+ teacache_offload = False
110
+ cfg_skip_ratio = 0
111
+ enable_riflex = False
112
+ riflex_k = 6
113
+
114
+ config_path = "config/wan2.1/wan_civitai.yaml"
115
+ model_name = "/scratch3/yan204/models/Wan2.1-T2V-14B"
116
+ sampler_name = "Flow_Unipc"
117
+ shift = 3
118
+ transformer_path = None
119
+ vae_path = None
120
+
121
+ fps = 10
122
+ weight_dtype = torch.bfloat16
123
+ negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
124
+ guidance_scale = 5.0
125
+ num_inference_steps = 50
126
+ lora_weight = 1.0
127
+
128
+
129
+ def save_results(tensor: torch.Tensor, file_path: str, fps_out: int = 16):
130
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
131
+ B, C, T, H, W = tensor.shape
132
+ arr = tensor[0].cpu().numpy()
133
+ if T == 1:
134
+ img = arr[:, 0].transpose(1, 2, 0)
135
+ img = (img * 255).astype(np.uint8)
136
+ Image.fromarray(img).save(file_path)
137
+ else:
138
+ save_videos_grid(tensor, file_path, fps=fps_out)
139
+ print(f"Saved video → {file_path}")
140
+
141
+
142
+ def _normalize_to_01(video: torch.Tensor) -> torch.Tensor:
143
+ with torch.no_grad():
144
+ vmin = float(video.min())
145
+ vmax = float(video.max())
146
+ if vmin < 0.0 or vmax > 1.0:
147
+ video = (video + 1.0) / 2.0
148
+ return video.clamp(0.0, 1.0)
149
+
150
+
151
+ def save_side_by_side(input_tensor: torch.Tensor, sample_tensor: torch.Tensor, file_path: str, fps_out: int = 16):
152
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
153
+ a = _normalize_to_01(input_tensor.detach().cpu())
154
+ b = _normalize_to_01(sample_tensor.detach().cpu())
155
+
156
+ # Align dimensions by cropping to the minimum across T/H/W
157
+ T = min(a.shape[2], b.shape[2])
158
+ H = min(a.shape[3], b.shape[3])
159
+ W = min(a.shape[4], b.shape[4])
160
+ a = a[:, :, :T, :H, :W]
161
+ b = b[:, :, :T, :H, :W]
162
+
163
+ combined = torch.cat([a, b], dim=4)
164
+ save_videos_grid(combined, file_path, fps=fps_out)
165
+ print(f"Saved side-by-side video → {file_path}")
166
+
167
+
168
+ def derive_ground_instruction(edit_instruction_text: str) -> str:
169
+ # Keep wrapper for backward compatibility; reuse the same rule as the training dataset
170
+ return derive_ground_object_from_instruction(edit_instruction_text)
171
+
172
+
173
+ def main():
174
+ args = parse_args()
175
+
176
+ # Initialize DDP
177
+ dist.init_process_group(backend="nccl")
178
+ rank = dist.get_rank()
179
+ world_size = dist.get_world_size()
180
+ local_rank = int(os.environ.get("LOCAL_RANK", rank % max(1, torch.cuda.device_count())))
181
+ torch.cuda.set_device(local_rank)
182
+
183
+ if rank == 0:
184
+ print(f"Running parallel CoT inference with {world_size} GPUs")
185
+ print(f"Using seed: {args.seed}")
186
+
187
+ model_name = args.model_name
188
+
189
+ # Load tasks
190
+ if args.test_json:
191
+ if rank == 0:
192
+ print(f"Loading tasks from JSON: {args.test_json}")
193
+ with open(args.test_json, 'r', encoding='utf-8') as f:
194
+ eval_prompts_list = json.load(f)
195
+
196
+ eval_prompts = {}
197
+ for item in eval_prompts_list:
198
+ # Assume the item has a compatible structure; otherwise fall back to index-based naming
199
+ # task_type/sample_id is expected to uniquely identify the item; if absent, create a fallback name
200
+ if 'task_type' in item and 'sample_id' in item:
201
+ fname = f"{item['task_type']}_{item['sample_id']}.mp4"
202
+ else:
203
+ # Fallback naming if JSON structure is different
204
+ fname = f"sample_{len(eval_prompts)}.mp4"
205
+ eval_prompts[fname] = item
206
+ items = list(eval_prompts.items())
207
+
208
+ elif args.video_path and args.prompt:
209
+ if rank == 0:
210
+ print(f"Running in single video mode: {args.video_path}")
211
+ fname = os.path.basename(args.video_path)
212
+ item = {
213
+ "source_video_path": args.video_path,
214
+ "edit_instruction": args.prompt
215
+ }
216
+ items = [(fname, item)]
217
+ else:
218
+ raise ValueError("Must provide either --test_json or both --video_path and --prompt")
219
+
220
+ # Filter done
221
+ pending_items = []
222
+ for fname, item in items:
223
+ base = os.path.splitext(fname)[0]
224
+ output_video_path = os.path.join(args.output_dir, f"gen_{base}.mp4")
225
+ if not os.path.exists(output_video_path):
226
+ pending_items.append((fname, item))
227
+
228
+ if rank == 0:
229
+ print(f"Total items: {len(items)}, already generated: {len(items) - len(pending_items)}, pending: {len(pending_items)}")
230
+
231
+ # Shard across GPUs
232
+ subset_items = pending_items[rank::world_size] if world_size > 0 else pending_items
233
+
234
+ print(f"[GPU {rank} | local {local_rank}] Processing {len(subset_items)} items")
235
+
236
+ device = torch.device(f"cuda:{local_rank}")
237
+
238
+ # Load config and models
239
+ config = OmegaConf.load(config_path)
240
+
241
+ transformer = WanTransformer3DModel.from_pretrained(
242
+ os.path.join(model_name, config['transformer_additional_kwargs'].get('transformer_subpath', 'transformer')),
243
+ transformer_additional_kwargs=OmegaConf.to_container(config['transformer_additional_kwargs']),
244
+ low_cpu_mem_usage=True,
245
+ torch_dtype=weight_dtype,
246
+ )
247
+
248
+ if transformer_path is not None:
249
+ print(f"[GPU {rank}] Loading transformer from checkpoint: {transformer_path}")
250
+ if transformer_path.endswith("safetensors"):
251
+ from safetensors.torch import load_file
252
+ state_dict = load_file(transformer_path)
253
+ else:
254
+ state_dict = torch.load(transformer_path, map_location="cpu")
255
+ state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
256
+ m, u = transformer.load_state_dict(state_dict, strict=False)
257
+ print(f"[GPU {rank}] Missing keys: {len(m)}, unexpected keys: {len(u)}")
258
+
259
+ vae = AutoencoderKLWan.from_pretrained(
260
+ os.path.join(model_name, config['vae_kwargs'].get('vae_subpath', 'vae')),
261
+ additional_kwargs=OmegaConf.to_container(config['vae_kwargs']),
262
+ ).to(weight_dtype)
263
+
264
+ if vae_path is not None:
265
+ print(f"[GPU {rank}] Loading VAE from checkpoint: {vae_path}")
266
+ if vae_path.endswith("safetensors"):
267
+ from safetensors.torch import load_file
268
+ state_dict = load_file(vae_path)
269
+ else:
270
+ state_dict = torch.load(vae_path, map_location="cpu")
271
+ state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
272
+ m, u = vae.load_state_dict(state_dict, strict=False)
273
+ print(f"[GPU {rank}] Missing keys: {len(m)}, unexpected keys: {len(u)}")
274
+
275
+ tokenizer = AutoTokenizer.from_pretrained(
276
+ os.path.join(model_name, config['text_encoder_kwargs'].get('tokenizer_subpath', 'tokenizer')),
277
+ )
278
+
279
+ text_encoder = WanT5EncoderModel.from_pretrained(
280
+ os.path.join(model_name, config['text_encoder_kwargs'].get('text_encoder_subpath', 'text_encoder')),
281
+ additional_kwargs=OmegaConf.to_container(config['text_encoder_kwargs']),
282
+ low_cpu_mem_usage=True,
283
+ torch_dtype=weight_dtype,
284
+ )
285
+
286
+ Choosen_Scheduler = {
287
+ "Flow": FlowMatchEulerDiscreteScheduler,
288
+ "Flow_Unipc": FlowUniPCMultistepScheduler,
289
+ "Flow_DPM++": FlowDPMSolverMultistepScheduler,
290
+ }[sampler_name]
291
+ if sampler_name in ["Flow_Unipc", "Flow_DPM++"]:
292
+ config['scheduler_kwargs']['shift'] = 1
293
+ scheduler = Choosen_Scheduler(
294
+ **filter_kwargs(Choosen_Scheduler, OmegaConf.to_container(config['scheduler_kwargs']))
295
+ )
296
+
297
+ pipeline = WanPipeline(
298
+ transformer=transformer,
299
+ vae=vae,
300
+ tokenizer=tokenizer,
301
+ text_encoder=text_encoder,
302
+ scheduler=scheduler,
303
+ )
304
+
305
+ # Memory mode
306
+ if GPU_memory_mode == "sequential_cpu_offload":
307
+ replace_parameters_by_name(transformer, ["modulation",], device=device)
308
+ transformer.freqs = transformer.freqs.to(device=device)
309
+ pipeline.enable_sequential_cpu_offload(device=device)
310
+ elif GPU_memory_mode == "model_cpu_offload_and_qfloat8":
311
+ convert_model_weight_to_float8(transformer, exclude_module_name=["modulation",], device=device)
312
+ convert_weight_dtype_wrapper(transformer, weight_dtype)
313
+ pipeline.enable_model_cpu_offload(device=device)
314
+ elif GPU_memory_mode == "model_cpu_offload":
315
+ pipeline.enable_model_cpu_offload(device=device)
316
+ elif GPU_memory_mode == "model_full_load_and_qfloat8":
317
+ convert_model_weight_to_float8(transformer, exclude_module_name=["modulation",], device=device)
318
+ convert_weight_dtype_wrapper(transformer, weight_dtype)
319
+ pipeline.to(device=device)
320
+ else:
321
+ pipeline.to(device=device)
322
+
323
+ # LoRA
324
+ if args.videocof_path is not None:
325
+ pipeline = merge_lora(pipeline, args.videocof_path, lora_weight, device=device)
326
+ print(f"[GPU {rank}] Loaded LoRA from {args.videocof_path}")
327
+
328
+ os.makedirs(args.output_dir, exist_ok=True)
329
+
330
+ generator = torch.Generator(device=device).manual_seed(args.seed + rank)
331
+
332
+ # Grounding indices are now handled inside the pipeline; no forward override needed.
333
+
334
+ for fname, item in subset_items:
335
+ base = os.path.splitext(fname)[0]
336
+ output_video_path = os.path.join(args.output_dir, f"gen_{base}.mp4")
337
+ info_path = os.path.join(args.output_dir, f"gen_{base}_info.txt")
338
+
339
+ print(f"[GPU {rank}] Processing {fname}...")
340
+
341
+ video_path = item["source_video_path"]
342
+
343
+ # Match training dataset (ImageVideoCoTDataset) prompt formatting
344
+ edit_text = item.get('text', item.get('qwen_vl_72b_refined_instruction', item.get('edit_instruction', '')))
345
+ ground_instr = derive_ground_instruction(edit_text)
346
+ prompt = (
347
+ "A video sequence showing three parts: first the original scene, "
348
+ f"then grounded {ground_instr}, and finally the same scene but {edit_text}"
349
+ )
350
+
351
+
352
+ input_video, video_height, video_width = load_video_frames(
353
+ video_path,
354
+ source_frames=args.source_frames,
355
+ )
356
+
357
+ with torch.no_grad():
358
+ sample = pipeline(
359
+ video=input_video,
360
+ prompt=prompt,
361
+ num_frames=args.num_frames,
362
+ source_frames=args.source_frames,
363
+ reasoning_frames=args.reasoning_frames,
364
+ negative_prompt=negative_prompt,
365
+ height=video_height,
366
+ width=video_width,
367
+ generator=generator,
368
+ guidance_scale=guidance_scale,
369
+ num_inference_steps=num_inference_steps,
370
+ shift=shift,
371
+ repeat_rope=args.repeat_rope,
372
+ cot=True,
373
+ ).videos
374
+
375
+ reason_edit_path = os.path.join(args.output_dir, f"gen_{base}_reason_edit.mp4")
376
+ save_results(sample, reason_edit_path, fps)
377
+ print(f"[GPU {rank}] Saved reason+edit video shape: {sample.shape}")
378
+
379
+ edit_video = sample[:, :, -args.source_frames:, :, :]
380
+ save_results(edit_video, output_video_path, fps)
381
+ print(f"[GPU {rank}] Edit video shape: {edit_video.shape}")
382
+
383
+ compare_path = os.path.join(args.output_dir, f"gen_{base}_compare.mp4")
384
+ save_side_by_side(input_video, edit_video, compare_path, fps)
385
+
386
+ with open(info_path, "w", encoding="utf-8") as info_f:
387
+ info_f.write(prompt)
388
+
389
+ print(f"[GPU {rank}] Completed {fname}")
390
+
391
+ if args.videocof_path is not None:
392
+ pipeline = unmerge_lora(pipeline, args.videocof_path, lora_weight, device=device)
393
+
394
+ print(f"[GPU {rank}] Finished processing all assigned items")
395
+
396
+
397
+ if __name__ == "__main__":
398
+ main()
399
+
400
+
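The edited clip is always the trailing `source_frames` frames of the pipeline output, with the source and grounding (reasoning) segments in front of it. Below is a minimal sketch of that layout and of the three-part prompt built in main(); the tensor is a stand-in for WanPipeline(...).videos, and using the raw instruction as the grounding text is an assumption (the real rule lives in derive_ground_object_from_instruction).

    import torch

    source_frames = 33      # frames taken from the source video
    reasoning_frames = 4    # grounding frames in the middle segment
    num_frames = 65         # total frames the model sees (source + reasoning + edited)

    edit_text = "remove the young man with short black hair on the left"
    ground_instr = edit_text  # assumption: stands in for derive_ground_instruction(edit_text)
    prompt = (
        "A video sequence showing three parts: first the original scene, "
        f"then grounded {ground_instr}, and finally the same scene but {edit_text}"
    )

    # Stand-in for pipeline output: (B, C, T, H, W) in [0, 1]
    sample = torch.rand(1, 3, num_frames, 480, 832)

    # The last `source_frames` frames are the edited clip saved as gen_<name>.mp4;
    # everything before them is the source segment plus the grounding segment.
    edit_video = sample[:, :, -source_frames:, :, :]
    print(prompt)
    print(edit_video.shape)   # torch.Size([1, 3, 33, 480, 832])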
install.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import subprocess
3
+ import locale
4
+ import threading
5
+ import os
6
+
7
+ def handle_stream(stream, prefix):
8
+ stream.reconfigure(encoding=locale.getpreferredencoding(), errors='replace')
9
+ for msg in stream:
10
+ if prefix == '[!]' and ('it/s]' in msg or 's/it]' in msg) and ('%|' in msg or 'it [' in msg):
11
+ if msg.startswith('100%'):
12
+ print('\r' + msg, end="", file=sys.stderr),
13
+ else:
14
+ print('\r' + msg[:-1], end="", file=sys.stderr)
15
+ else:
16
+ if prefix == '[!]':
17
+ print(prefix, msg, end="", file=sys.stderr)
18
+ else:
19
+ print(prefix, msg, end="")
20
+
21
+ def process_wrap(cmd_str, cwd_path, handler=None):
22
+ process = subprocess.Popen(cmd_str, cwd=cwd_path, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
23
+
24
+ if handler is None:
25
+ handler = handle_stream
26
+
27
+ stdout_thread = threading.Thread(target=handler, args=(process.stdout, ""))
28
+ stderr_thread = threading.Thread(target=handler, args=(process.stderr, "[!]"))
29
+
30
+ stdout_thread.start()
31
+ stderr_thread.start()
32
+
33
+ stdout_thread.join()
34
+ stderr_thread.join()
35
+
36
+ return process.wait()
37
+
38
+ assert process_wrap([sys.executable, "-m", "pip", "install", "-r", "requirements.txt"], cwd_path=os.path.dirname(os.path.realpath(__file__))) == 0, "ERROR: Failed to install requirements.txt. Please install them manually, and restart ComfyUI."
39
+
40
+ nodep_packages = [
41
+ "kornia>=0.6.9",
42
+ "xformers>=0.0.20",
43
+ ]
44
+
45
+ assert process_wrap([sys.executable, "-m", "pip", "install", "--no-deps", *nodep_packages], cwd_path=os.path.dirname(os.path.realpath(__file__))) == 0, "ERROR: Failed to install last set of packages. Please install them manually, and restart ComfyUI."
pyproject.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "videox-fun"
3
+ description = "VideoX-Fun is a video generation pipeline that can be used to generate AI images and videos, as well as to train baseline and Lora models for Diffusion Transformer. We support direct prediction from pre-trained baseline models to generate videos with different resolutions, durations, and FPS. Additionally, we also support users in training their own baseline and Lora models to perform specific style transformations."
4
+ version = "1.0.0"
5
+ license = {file = "LICENSE"}
6
+ dependencies = ["Pillow", "einops", "safetensors", "timm", "tomesd", "torch>=2.1.2", "torchdiffeq", "torchsde", "decord", "datasets", "numpy", "scikit-image", "opencv-python", "omegaconf", "SentencePiece", "albumentations", "imageio[ffmpeg]", "imageio[pyav]", "tensorboard", "beautifulsoup4", "ftfy", "func_timeout", "accelerate>=0.25.0", "gradio>=3.41.2,<=3.48.0", "diffusers>=0.30.1,<=0.31.0", "transformers>=4.46.2"]
7
+
8
+ [project.urls]
9
+ Repository = "https://github.com/aigc-apps/VideoX-Fun"
10
+ # Used by Comfy Registry https://comfyregistry.org
11
+
12
+ [tool.comfy]
13
+ PublisherId = "bubbliiiing"
14
+ DisplayName = "VideoX-Fun"
15
+ Icon = ""
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pillow
2
+ einops
3
+ safetensors
4
+ timm
5
+ tomesd
6
+ torchdiffeq
7
+ torchsde
8
+ decord
9
+ datasets
10
+ numpy
11
+ scikit-image
12
+ opencv-python
13
+ omegaconf
14
+ SentencePiece
15
+ albumentations
16
+ imageio[ffmpeg]
17
+ imageio[pyav]
18
+ tensorboard
19
+ beautifulsoup4
20
+ ftfy
21
+ func_timeout
22
+ onnxruntime
23
+ accelerate>=0.25.0
24
+ gradio>=3.41.2
25
+ diffusers>=0.30.1
26
+ transformers>=4.46.2
scripts/local_style.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export CUDA_VISIBLE_DEVICES=2
2
+
3
+ torchrun --nproc_per_node=1 inference.py \
4
+ --video_path assets/sign.mp4 \
5
+ --prompt "Replace the yellow \"SCHOOL\" sign with a red hospital sign, featuring a white hospital emblem on the top and the word \"HOSPITAL\" below." \
6
+ --output_dir results/local_style \
7
+ --model_name /scratch3/yan204/models/Wan2.1-T2V-14B \
8
+ --seed 0 \
9
+ --num_frames 33 \
10
+ --source_frames 33 \
11
+ --reasoning_frames 4 \
12
+ --repeat_rope \
13
+ --videocof_path videocof_weight/videocof.safetensors
scripts/obj_add.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export CUDA_VISIBLE_DEVICES=0
2
+
3
+ torchrun --nproc_per_node=1 inference.py \
4
+ --video_path assets/woman_ballon.mp4 \
5
+ --prompt "Add the woman in a floral dress pointing at the balloon on the left." \
6
+ --output_dir results/obj_add \
7
+ --model_name /scratch3/yan204/models/Wan2.1-T2V-14B \
8
+ --seed 0 \
9
+ --num_frames 33 \
10
+ --source_frames 33 \
11
+ --reasoning_frames 4 \
12
+ --repeat_rope \
13
+ --videocof_path videocof_weight/videocof.safetensors
scripts/obj_rem.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export CUDA_VISIBLE_DEVICES=1
2
+
3
+ torchrun --nproc_per_node=1 inference.py \
4
+ --video_path assets/two_man.mp4 \
5
+ --prompt "Remove the young man with short black hair wearing black shirt on the left." \
6
+ --output_dir results/obj_rem \
7
+ --model_name /scratch3/yan204/models/Wan2.1-T2V-14B \
8
+ --seed 0 \
9
+ --num_frames 33 \
10
+ --source_frames 33 \
11
+ --reasoning_frames 4 \
12
+ --repeat_rope \
13
+ --videocof_path videocof_weight/videocof.safetensors
scripts/parallel_infer.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
2
+
3
+ torchrun --nproc_per_node=4 inference.py \
4
+ --test_json assets/teaser_test.json \
5
+ --output_dir results/torch_2.5.1 \
6
+ --model_name /scratch3/yan204/models/Wan2.1-T2V-14B \
7
+ --seed 0 \
8
+ --num_frames 33 \
9
+ --source_frames 33 \
10
+ --reasoning_frames 4 \
11
+ --repeat_rope \
12
+ --videocof_path videocof_weight/videocof.safetensors
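For batch runs, --test_json points at a list of task dicts like the one sketched below; field names beyond task_type / sample_id / source_video_path / edit_instruction are assumptions (see assets/teaser_test.json for the real entries), and the output paths follow the naming used in inference.py.

    import json, os

    # Hypothetical task list in the shape inference.py expects.
    tasks = [
        {
            "task_type": "obj_rem",
            "sample_id": "0001",
            "source_video_path": "assets/two_man.mp4",
            "edit_instruction": "Remove the young man with short black hair wearing black shirt on the left.",
        },
    ]
    with open("my_test.json", "w", encoding="utf-8") as f:
        json.dump(tasks, f, indent=2)

    output_dir = "results/torch_2.5.1"
    for item in tasks:
        base = f"{item['task_type']}_{item['sample_id']}"
        print(os.path.join(output_dir, f"gen_{base}.mp4"))              # edited clip only
        print(os.path.join(output_dir, f"gen_{base}_reason_edit.mp4"))  # source + grounding + edit
        print(os.path.join(output_dir, f"gen_{base}_compare.mp4"))      # side-by-side with the input
        print(os.path.join(output_dir, f"gen_{base}_info.txt"))         # prompt that was used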
videox_fun/__init__.py ADDED
File without changes
videox_fun/api/api.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import gc
3
+ import hashlib
4
+ import io
5
+ import os
6
+ import tempfile
7
+ from io import BytesIO
8
+
9
+ import gradio as gr
10
+ import requests
11
+ import torch
12
+ from fastapi import FastAPI
13
+ from PIL import Image
14
+
15
+
16
+ # Function to encode a file to Base64
17
+ def encode_file_to_base64(file_path):
18
+ with open(file_path, "rb") as file:
19
+ # Encode the data to Base64
20
+ file_base64 = base64.b64encode(file.read())
21
+ return file_base64
22
+
23
+ def update_diffusion_transformer_api(_: gr.Blocks, app: FastAPI, controller):
24
+ @app.post("/videox_fun/update_diffusion_transformer")
25
+ def _update_diffusion_transformer_api(
26
+ datas: dict,
27
+ ):
28
+ diffusion_transformer_path = datas.get('diffusion_transformer_path', 'none')
29
+
30
+ try:
31
+ controller.update_diffusion_transformer(
32
+ diffusion_transformer_path
33
+ )
34
+ comment = "Success"
35
+ except Exception as e:
36
+ torch.cuda.empty_cache()
37
+ comment = f"Error. error information is {str(e)}"
38
+
39
+ return {"message": comment}
40
+
41
+ def download_from_url(url, timeout=10):
42
+ try:
43
+ response = requests.get(url, timeout=timeout)
44
+ response.raise_for_status() # 检查请求是否成功
45
+ return response.content
46
+ except requests.exceptions.RequestException as e:
47
+ print(f"Error downloading from {url}: {e}")
48
+ return None
49
+
50
+ def save_base64_video(base64_string):
51
+ video_data = base64.b64decode(base64_string)
52
+
53
+ md5_hash = hashlib.md5(video_data).hexdigest()
54
+ filename = f"{md5_hash}.mp4"
55
+
56
+ temp_dir = tempfile.gettempdir()
57
+ file_path = os.path.join(temp_dir, filename)
58
+
59
+ with open(file_path, 'wb') as video_file:
60
+ video_file.write(video_data)
61
+
62
+ return file_path
63
+
64
+ def save_base64_image(base64_string):
65
+ video_data = base64.b64decode(base64_string)
66
+
67
+ md5_hash = hashlib.md5(video_data).hexdigest()
68
+ filename = f"{md5_hash}.jpg"
69
+
70
+ temp_dir = tempfile.gettempdir()
71
+ file_path = os.path.join(temp_dir, filename)
72
+
73
+ with open(file_path, 'wb') as video_file:
74
+ video_file.write(video_data)
75
+
76
+ return file_path
77
+
78
+ def save_url_video(url):
79
+ video_data = download_from_url(url)
80
+ if video_data:
81
+ return save_base64_video(base64.b64encode(video_data))
82
+ return None
83
+
84
+ def save_url_image(url):
85
+ image_data = download_from_url(url)
86
+ if image_data:
87
+ return save_base64_image(base64.b64encode(image_data))
88
+ return None
89
+
90
+ def infer_forward_api(_: gr.Blocks, app: FastAPI, controller):
91
+ @app.post("/videox_fun/infer_forward")
92
+ def _infer_forward_api(
93
+ datas: dict,
94
+ ):
95
+ base_model_path = datas.get('base_model_path', 'none')
96
+ base_model_2_path = datas.get('base_model_2_path', 'none')
97
+ lora_model_path = datas.get('lora_model_path', 'none')
98
+ lora_model_2_path = datas.get('lora_model_2_path', 'none')
99
+ lora_alpha_slider = datas.get('lora_alpha_slider', 0.55)
100
+ prompt_textbox = datas.get('prompt_textbox', None)
101
+ negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. ')
102
+ sampler_dropdown = datas.get('sampler_dropdown', 'Euler')
103
+ sample_step_slider = datas.get('sample_step_slider', 30)
104
+ resize_method = datas.get('resize_method', "Generate by")
105
+ width_slider = datas.get('width_slider', 672)
106
+ height_slider = datas.get('height_slider', 384)
107
+ base_resolution = datas.get('base_resolution', 512)
108
+ is_image = datas.get('is_image', False)
109
+ generation_method = datas.get('generation_method', False)
110
+ length_slider = datas.get('length_slider', 49)
111
+ overlap_video_length = datas.get('overlap_video_length', 4)
112
+ partial_video_length = datas.get('partial_video_length', 72)
113
+ cfg_scale_slider = datas.get('cfg_scale_slider', 6)
114
+ start_image = datas.get('start_image', None)
115
+ end_image = datas.get('end_image', None)
116
+ validation_video = datas.get('validation_video', None)
117
+ validation_video_mask = datas.get('validation_video_mask', None)
118
+ control_video = datas.get('control_video', None)
119
+ denoise_strength = datas.get('denoise_strength', 0.70)
120
+ seed_textbox = datas.get("seed_textbox", 43)
121
+
122
+ ref_image = datas.get('ref_image', None)
123
+ enable_teacache = datas.get('enable_teacache', True)
124
+ teacache_threshold = datas.get('teacache_threshold', 0.10)
125
+ num_skip_start_steps = datas.get('num_skip_start_steps', 1)
126
+ teacache_offload = datas.get('teacache_offload', False)
127
+ cfg_skip_ratio = datas.get('cfg_skip_ratio', 0)
128
+ enable_riflex = datas.get('enable_riflex', False)
129
+ riflex_k = datas.get('riflex_k', 6)
130
+ fps = datas.get('fps', None)
131
+
132
+ generation_method = "Image Generation" if is_image else generation_method
133
+
134
+ if start_image is not None:
135
+ if start_image.startswith('http'):
136
+ start_image = save_url_image(start_image)
137
+ start_image = [Image.open(start_image).convert("RGB")]
138
+ else:
139
+ start_image = base64.b64decode(start_image)
140
+ start_image = [Image.open(BytesIO(start_image)).convert("RGB")]
141
+
142
+ if end_image is not None:
143
+ if end_image.startswith('http'):
144
+ end_image = save_url_image(end_image)
145
+ end_image = [Image.open(end_image).convert("RGB")]
146
+ else:
147
+ end_image = base64.b64decode(end_image)
148
+ end_image = [Image.open(BytesIO(end_image)).convert("RGB")]
149
+
150
+ if validation_video is not None:
151
+ if validation_video.startswith('http'):
152
+ validation_video = save_url_video(validation_video)
153
+ else:
154
+ validation_video = save_base64_video(validation_video)
155
+
156
+ if validation_video_mask is not None:
157
+ if validation_video_mask.startswith('http'):
158
+ validation_video_mask = save_url_image(validation_video_mask)
159
+ else:
160
+ validation_video_mask = save_base64_image(validation_video_mask)
161
+
162
+ if control_video is not None:
163
+ if control_video.startswith('http'):
164
+ control_video = save_url_video(control_video)
165
+ else:
166
+ control_video = save_base64_video(control_video)
167
+
168
+ if ref_image is not None:
169
+ if ref_image.startswith('http'):
170
+ ref_image = save_url_image(ref_image)
171
+ ref_image = [Image.open(ref_image).convert("RGB")]
172
+ else:
173
+ ref_image = base64.b64decode(ref_image)
174
+ ref_image = [Image.open(BytesIO(ref_image)).convert("RGB")]
175
+
176
+ try:
177
+ save_sample_path, comment = controller.generate(
178
+ "",
179
+ base_model_path,
180
+ lora_model_path,
181
+ lora_alpha_slider,
182
+ prompt_textbox,
183
+ negative_prompt_textbox,
184
+ sampler_dropdown,
185
+ sample_step_slider,
186
+ resize_method,
187
+ width_slider,
188
+ height_slider,
189
+ base_resolution,
190
+ generation_method,
191
+ length_slider,
192
+ overlap_video_length,
193
+ partial_video_length,
194
+ cfg_scale_slider,
195
+ start_image,
196
+ end_image,
197
+ validation_video,
198
+ validation_video_mask,
199
+ control_video,
200
+ denoise_strength,
201
+ seed_textbox,
202
+ ref_image = ref_image,
203
+ enable_teacache = enable_teacache,
204
+ teacache_threshold = teacache_threshold,
205
+ num_skip_start_steps = num_skip_start_steps,
206
+ teacache_offload = teacache_offload,
207
+ cfg_skip_ratio = cfg_skip_ratio,
208
+ enable_riflex = enable_riflex,
209
+ riflex_k = riflex_k,
210
+ base_model_2_dropdown = base_model_2_path,
211
+ lora_model_2_dropdown = lora_model_2_path,
212
+ fps = fps,
213
+ is_api = True,
214
+ )
215
+ except Exception as e:
216
+ gc.collect()
217
+ torch.cuda.empty_cache()
218
+ torch.cuda.ipc_collect()
219
+ save_sample_path = ""
220
+ comment = f"Error. error information is {str(e)}"
221
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
222
+
223
+ if save_sample_path != "":
224
+ return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
225
+ else:
226
+ return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": None}
videox_fun/api/api_multi_nodes.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from https://github.com/xdit-project/xDiT/blob/main/entrypoints/launch.py
2
+ import base64
3
+ import gc
4
+ import hashlib
5
+ import io
6
+ import os
7
+ import tempfile
8
+ from io import BytesIO
9
+
10
+ import gradio as gr
11
+ import requests
12
+ import torch
13
+ import torch.distributed as dist
14
+ from fastapi import FastAPI, HTTPException
15
+ from PIL import Image
16
+
17
+ from .api import download_from_url, encode_file_to_base64
18
+
19
+ try:
20
+ import ray
21
+ except ImportError:
22
+ print("Ray is not installed. If you want to use multi gpus api. Please install it by running 'pip install ray'.")
23
+ ray = None
24
+
25
+ def save_base64_video_dist(base64_string):
26
+ video_data = base64.b64decode(base64_string)
27
+
28
+ md5_hash = hashlib.md5(video_data).hexdigest()
29
+ filename = f"{md5_hash}.mp4"
30
+
31
+ temp_dir = tempfile.gettempdir()
32
+ file_path = os.path.join(temp_dir, filename)
33
+
34
+ if dist.is_initialized():
35
+ if dist.get_rank() == 0:
36
+ with open(file_path, 'wb') as video_file:
37
+ video_file.write(video_data)
38
+ dist.barrier()
39
+ else:
40
+ with open(file_path, 'wb') as video_file:
41
+ video_file.write(video_data)
42
+ return file_path
43
+
44
+ def save_base64_image_dist(base64_string):
45
+ video_data = base64.b64decode(base64_string)
46
+
47
+ md5_hash = hashlib.md5(video_data).hexdigest()
48
+ filename = f"{md5_hash}.jpg"
49
+
50
+ temp_dir = tempfile.gettempdir()
51
+ file_path = os.path.join(temp_dir, filename)
52
+
53
+ if dist.is_initialized():
54
+ if dist.get_rank() == 0:
55
+ with open(file_path, 'wb') as video_file:
56
+ video_file.write(video_data)
57
+ dist.barrier()
58
+ else:
59
+ with open(file_path, 'wb') as video_file:
60
+ video_file.write(video_data)
61
+ return file_path
62
+
63
+ def save_url_video_dist(url):
64
+ video_data = download_from_url(url)
65
+ if video_data:
66
+ return save_base64_video_dist(base64.b64encode(video_data))
67
+ return None
68
+
69
+ def save_url_image_dist(url):
70
+ image_data = download_from_url(url)
71
+ if image_data:
72
+ return save_base64_image_dist(base64.b64encode(image_data))
73
+ return None
74
+
75
+ if ray is not None:
76
+ @ray.remote(num_gpus=1)
77
+ class MultiNodesGenerator:
78
+ def __init__(
79
+ self, rank: int, world_size: int, Controller,
80
+ GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
81
+ config_path=None, ulysses_degree=1, ring_degree=1,
82
+ fsdp_dit=False, fsdp_text_encoder=False, compile_dit=False,
83
+ weight_dtype=None, savedir_sample=None,
84
+ ):
85
+ # Set PyTorch distributed environment variables
86
+ os.environ["RANK"] = str(rank)
87
+ os.environ["WORLD_SIZE"] = str(world_size)
88
+ os.environ["MASTER_ADDR"] = "127.0.0.1"
89
+ os.environ["MASTER_PORT"] = "29500"
90
+
91
+ self.rank = rank
92
+ self.controller = Controller(
93
+ GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path,
94
+ ulysses_degree=ulysses_degree, ring_degree=ring_degree,
95
+ fsdp_dit=fsdp_dit, fsdp_text_encoder=fsdp_text_encoder, compile_dit=compile_dit,
96
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
97
+ )
98
+
99
+ def generate(self, datas):
100
+ try:
101
+ base_model_path = datas.get('base_model_path', 'none')
102
+ base_model_2_path = datas.get('base_model_2_path', 'none')
103
+ lora_model_path = datas.get('lora_model_path', 'none')
104
+ lora_model_2_path = datas.get('lora_model_2_path', 'none')
105
+ lora_alpha_slider = datas.get('lora_alpha_slider', 0.55)
106
+ prompt_textbox = datas.get('prompt_textbox', None)
107
+ negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. ')
108
+ sampler_dropdown = datas.get('sampler_dropdown', 'Euler')
109
+ sample_step_slider = datas.get('sample_step_slider', 30)
110
+ resize_method = datas.get('resize_method', "Generate by")
111
+ width_slider = datas.get('width_slider', 672)
112
+ height_slider = datas.get('height_slider', 384)
113
+ base_resolution = datas.get('base_resolution', 512)
114
+ is_image = datas.get('is_image', False)
115
+ generation_method = datas.get('generation_method', False)
116
+ length_slider = datas.get('length_slider', 49)
117
+ overlap_video_length = datas.get('overlap_video_length', 4)
118
+ partial_video_length = datas.get('partial_video_length', 72)
119
+ cfg_scale_slider = datas.get('cfg_scale_slider', 6)
120
+ start_image = datas.get('start_image', None)
121
+ end_image = datas.get('end_image', None)
122
+ validation_video = datas.get('validation_video', None)
123
+ validation_video_mask = datas.get('validation_video_mask', None)
124
+ control_video = datas.get('control_video', None)
125
+ denoise_strength = datas.get('denoise_strength', 0.70)
126
+ seed_textbox = datas.get("seed_textbox", 43)
127
+
128
+ ref_image = datas.get('ref_image', None)
129
+ enable_teacache = datas.get('enable_teacache', True)
130
+ teacache_threshold = datas.get('teacache_threshold', 0.10)
131
+ num_skip_start_steps = datas.get('num_skip_start_steps', 1)
132
+ teacache_offload = datas.get('teacache_offload', False)
133
+ cfg_skip_ratio = datas.get('cfg_skip_ratio', 0)
134
+ enable_riflex = datas.get('enable_riflex', False)
135
+ riflex_k = datas.get('riflex_k', 6)
136
+ fps = datas.get('fps', None)
137
+
138
+ generation_method = "Image Generation" if is_image else generation_method
139
+
140
+ if start_image is not None:
141
+ if start_image.startswith('http'):
142
+ start_image = save_url_image_dist(start_image)
143
+ start_image = [Image.open(start_image).convert("RGB")]
144
+ else:
145
+ start_image = base64.b64decode(start_image)
146
+ start_image = [Image.open(BytesIO(start_image)).convert("RGB")]
147
+
148
+ if end_image is not None:
149
+ if end_image.startswith('http'):
150
+ end_image = save_url_image_dist(end_image)
151
+ end_image = [Image.open(end_image).convert("RGB")]
152
+ else:
153
+ end_image = base64.b64decode(end_image)
154
+ end_image = [Image.open(BytesIO(end_image)).convert("RGB")]
155
+
156
+ if validation_video is not None:
157
+ if validation_video.startswith('http'):
158
+ validation_video = save_url_video_dist(validation_video)
159
+ else:
160
+ validation_video = save_base64_video_dist(validation_video)
161
+
162
+ if validation_video_mask is not None:
163
+ if validation_video_mask.startswith('http'):
164
+ validation_video_mask = save_url_image_dist(validation_video_mask)
165
+ else:
166
+ validation_video_mask = save_base64_image_dist(validation_video_mask)
167
+
168
+ if control_video is not None:
169
+ if control_video.startswith('http'):
170
+ control_video = save_url_video_dist(control_video)
171
+ else:
172
+ control_video = save_base64_video_dist(control_video)
173
+
174
+ if ref_image is not None:
175
+ if ref_image.startswith('http'):
176
+ ref_image = save_url_image_dist(ref_image)
177
+ ref_image = [Image.open(ref_image).convert("RGB")]
178
+ else:
179
+ ref_image = base64.b64decode(ref_image)
180
+ ref_image = [Image.open(BytesIO(ref_image)).convert("RGB")]
181
+
182
+ try:
183
+ save_sample_path, comment = self.controller.generate(
184
+ "",
185
+ base_model_path,
186
+ lora_model_path,
187
+ lora_alpha_slider,
188
+ prompt_textbox,
189
+ negative_prompt_textbox,
190
+ sampler_dropdown,
191
+ sample_step_slider,
192
+ resize_method,
193
+ width_slider,
194
+ height_slider,
195
+ base_resolution,
196
+ generation_method,
197
+ length_slider,
198
+ overlap_video_length,
199
+ partial_video_length,
200
+ cfg_scale_slider,
201
+ start_image,
202
+ end_image,
203
+ validation_video,
204
+ validation_video_mask,
205
+ control_video,
206
+ denoise_strength,
207
+ seed_textbox,
208
+ ref_image = ref_image,
209
+ enable_teacache = enable_teacache,
210
+ teacache_threshold = teacache_threshold,
211
+ num_skip_start_steps = num_skip_start_steps,
212
+ teacache_offload = teacache_offload,
213
+ cfg_skip_ratio = cfg_skip_ratio,
214
+ enable_riflex = enable_riflex,
215
+ riflex_k = riflex_k,
216
+ base_model_2_dropdown = base_model_2_path,
217
+ lora_model_2_dropdown = lora_model_2_path,
218
+ fps = fps,
219
+ is_api = True,
220
+ )
221
+ except Exception as e:
222
+ gc.collect()
223
+ torch.cuda.empty_cache()
224
+ torch.cuda.ipc_collect()
225
+ save_sample_path = ""
226
+ comment = f"Error. error information is {str(e)}"
227
+ if dist.is_initialized():
228
+ if dist.get_rank() == 0:
229
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
230
+ else:
231
+ return None
232
+ else:
233
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
234
+
235
+
236
+ if dist.is_initialized():
237
+ if dist.get_rank() == 0:
238
+ if save_sample_path != "":
239
+ return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
240
+ else:
241
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
242
+ else:
243
+ return None
244
+ else:
245
+ if save_sample_path != "":
246
+ return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
247
+ else:
248
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
249
+
250
+ except Exception as e:
251
+ print(f"Error generating: {str(e)}")
252
+ comment = f"Error generating: {str(e)}"
253
+ if dist.is_initialized():
254
+ if dist.get_rank() == 0:
255
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
256
+ else:
257
+ return None
258
+ else:
259
+ return {"message": comment, "save_sample_path": None, "base64_encoding": None}
260
+
261
+ class MultiNodesEngine:
262
+ def __init__(
263
+ self,
264
+ world_size,
265
+ Controller,
266
+ GPU_memory_mode,
267
+ scheduler_dict,
268
+ model_name,
269
+ model_type,
270
+ config_path,
271
+ ulysses_degree=1,
272
+ ring_degree=1,
273
+ fsdp_dit=False,
274
+ fsdp_text_encoder=False,
275
+ compile_dit=False,
276
+ weight_dtype=torch.bfloat16,
277
+ savedir_sample="samples"
278
+ ):
279
+ # Ensure Ray is initialized
280
+ if not ray.is_initialized():
281
+ ray.init()
282
+
283
+ num_workers = world_size
284
+ self.workers = [
285
+ MultiNodesGenerator.remote(
286
+ rank, world_size, Controller,
287
+ GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path,
288
+ ulysses_degree=ulysses_degree, ring_degree=ring_degree,
289
+ fsdp_dit=fsdp_dit, fsdp_text_encoder=fsdp_text_encoder, compile_dit=compile_dit,
290
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
291
+ )
292
+ for rank in range(num_workers)
293
+ ]
294
+ print("Update workers done")
295
+
296
+ async def generate(self, data):
297
+ results = ray.get([
298
+ worker.generate.remote(data)
299
+ for worker in self.workers
300
+ ])
301
+
302
+ return next(path for path in results if path is not None)
303
+
304
+ def multi_nodes_infer_forward_api(_: gr.Blocks, app: FastAPI, engine):
305
+
306
+ @app.post("/videox_fun/infer_forward")
307
+ async def _multi_nodes_infer_forward_api(
308
+ datas: dict,
309
+ ):
310
+ try:
311
+ result = await engine.generate(datas)
312
+ return result
313
+ except Exception as e:
314
+ if isinstance(e, HTTPException):
315
+ raise e
316
+ raise HTTPException(status_code=500, detail=str(e))
317
+ else:
318
+ MultiNodesEngine = None
319
+ MultiNodesGenerator = None
320
+ multi_nodes_infer_forward_api = None
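All Ray workers run the same generation, but only rank 0 returns a response dict (the other ranks return None), and MultiNodesEngine.generate keeps the first non-None result. A trivial illustration of that selection:

    # Toy stand-in for the list ray.get(...) returns across four workers.
    results = [
        {"message": "Success", "save_sample_path": "samples/out.mp4", "base64_encoding": "..."},
        None, None, None,
    ]
    final = next(r for r in results if r is not None)
    print(final["message"], final["save_sample_path"])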
videox_fun/data/bucket_sampler.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import os
3
+ from typing import (Generic, Iterable, Iterator, List, Optional, Sequence,
4
+ Sized, TypeVar, Union)
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image
10
+ from torch.utils.data import BatchSampler, Dataset, Sampler
11
+
12
+ # Original presets (commented out per request):
13
+ CUSTOM_ASPECT_RATIOS = {
14
+ "0.5676": [336, 592], # count=133984
15
+ "1.7619": [592, 336], # count=78813
16
+ "0.5682": [400, 704], # count=4421
17
+ "0.5556": [320, 576], # count=2481
18
+ "1.7600": [704, 400], # count=1682
19
+ "0.5319": [400, 752], # count=1235
20
+ "1.8000": [576, 320], # count=924
21
+ "0.5128": [320, 624], # count=711
22
+ "1.8800": [752, 400], # count=400
23
+ "1.9000": [608, 320], # count=226
24
+ "0.4237": [400, 944], # count=29
25
+ }
26
+ # CUSTOM_ASPECT_RATIOS = {
27
+ # "0.5676": [336, 592], # 336x592 (h x w)
28
+ # "1.7619": [592, 336], # 592x336
29
+ # "0.5682": [400, 704], # 400x704
30
+ # "1.7600": [704, 400], # 704x400
31
+ # "0.5319": [400, 752], # 400x752
32
+ # "1.8800": [752, 400], # 752x400
33
+ # "0.4237": [400, 944], # 400x944
34
+ # }
35
+
36
+
37
+ ASPECT_RATIO_512 = {
38
+ '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
39
+ '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
40
+ '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
41
+ '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
42
+ '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
43
+ '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
44
+ '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
45
+ '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
46
+ '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
47
+ '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
48
+ }
49
+ ASPECT_RATIO_RANDOM_CROP_512 = {
50
+ '0.42': [320.0, 768.0], '0.5': [352.0, 704.0],
51
+ '0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0],
52
+ '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0],
53
+ '1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0],
54
+ '2.0': [704.0, 352.0], '2.4': [768.0, 320.0]
55
+ }
56
+ ASPECT_RATIO_RANDOM_CROP_PROB = [
57
+ 1, 2,
58
+ 4, 4, 4, 4,
59
+ 8, 8, 8,
60
+ 4, 4, 4, 4,
61
+ 2, 1
62
+ ]
63
+ ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB)
64
+
65
+ def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512):
66
+ aspect_ratio = height / width
67
+ closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
68
+ return ratios[closest_ratio], float(closest_ratio)
69
+
70
+ def get_image_size_without_loading(path):
71
+ with Image.open(path) as img:
72
+ return img.size # (width, height)
73
+
74
+ class RandomSampler(Sampler[int]):
75
+ r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
76
+
77
+ If with replacement, then user can specify :attr:`num_samples` to draw.
78
+
79
+ Args:
80
+ data_source (Dataset): dataset to sample from
81
+ replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
82
+ num_samples (int): number of samples to draw, default=`len(dataset)`.
83
+ generator (Generator): Generator used in sampling.
84
+ """
85
+
86
+ data_source: Sized
87
+ replacement: bool
88
+
89
+ def __init__(self, data_source: Sized, replacement: bool = False,
90
+ num_samples: Optional[int] = None, generator=None) -> None:
91
+ self.data_source = data_source
92
+ self.replacement = replacement
93
+ self._num_samples = num_samples
94
+ self.generator = generator
95
+ self._pos_start = 0
96
+
97
+ if not isinstance(self.replacement, bool):
98
+ raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}")
99
+
100
+ if not isinstance(self.num_samples, int) or self.num_samples <= 0:
101
+ raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}")
102
+
103
+ @property
104
+ def num_samples(self) -> int:
105
+ # dataset size might change at runtime
106
+ if self._num_samples is None:
107
+ return len(self.data_source)
108
+ return self._num_samples
109
+
110
+ def __iter__(self) -> Iterator[int]:
111
+ n = len(self.data_source)
112
+ if self.generator is None:
113
+ seed = int(torch.empty((), dtype=torch.int64).random_().item())
114
+ generator = torch.Generator()
115
+ generator.manual_seed(seed)
116
+ else:
117
+ generator = self.generator
118
+
119
+ if self.replacement:
120
+ for _ in range(self.num_samples // 32):
121
+ yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist()
122
+ yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist()
123
+ else:
124
+ for _ in range(self.num_samples // n):
125
+ xx = torch.randperm(n, generator=generator).tolist()
126
+ if self._pos_start >= n:
127
+ self._pos_start = 0
128
+ for idx in range(self._pos_start, n):
129
+ yield xx[idx]
130
+ self._pos_start = (self._pos_start + 1) % n
131
+ self._pos_start = 0
132
+ yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
133
+
134
+ def __len__(self) -> int:
135
+ return self.num_samples
136
+
137
+ class AspectRatioBatchImageSampler(BatchSampler):
138
+ """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
139
+
140
+ Args:
141
+ sampler (Sampler): Base sampler.
142
+ dataset (Dataset): Dataset providing data information.
143
+ batch_size (int): Size of mini-batch.
144
+ drop_last (bool): If ``True``, the sampler will drop the last batch if
145
+ its size would be less than ``batch_size``.
146
+ aspect_ratios (dict): The predefined aspect ratios.
147
+ """
148
+ def __init__(
149
+ self,
150
+ sampler: Sampler,
151
+ dataset: Dataset,
152
+ batch_size: int,
153
+ train_folder: str = None,
154
+ aspect_ratios: dict = ASPECT_RATIO_512,
155
+ drop_last: bool = False,
156
+ config=None,
157
+ **kwargs
158
+ ) -> None:
159
+ if not isinstance(sampler, Sampler):
160
+ raise TypeError('sampler should be an instance of ``Sampler``, '
161
+ f'but got {sampler}')
162
+ if not isinstance(batch_size, int) or batch_size <= 0:
163
+ raise ValueError('batch_size should be a positive integer value, '
164
+ f'but got batch_size={batch_size}')
165
+ self.sampler = sampler
166
+ self.dataset = dataset
167
+ self.train_folder = train_folder
168
+ self.batch_size = batch_size
169
+ self.aspect_ratios = aspect_ratios
170
+ self.drop_last = drop_last
171
+ self.config = config
172
+ # buckets for each aspect ratio
173
+ self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
174
+ # [str(k) for k, v in aspect_ratios]
175
+ self.current_available_bucket_keys = list(aspect_ratios.keys())
176
+
177
+ def __iter__(self):
178
+ for idx in self.sampler:
179
+ try:
180
+ image_dict = self.dataset[idx]
181
+
182
+ width, height = image_dict.get("width", None), image_dict.get("height", None)
183
+ if width is None or height is None:
184
+ image_id, name = image_dict['file_path'], image_dict['text']
185
+ if self.train_folder is None:
186
+ image_dir = image_id
187
+ else:
188
+ image_dir = os.path.join(self.train_folder, image_id)
189
+
190
+ width, height = get_image_size_without_loading(image_dir)
191
+
192
+ ratio = height / width # self.dataset[idx]
193
+ else:
194
+ height = int(height)
195
+ width = int(width)
196
+ ratio = height / width # self.dataset[idx]
197
+ except Exception as e:
198
+ print(e)
199
+ continue
200
+ # find the closest aspect ratio
201
+ closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
202
+ if closest_ratio not in self.current_available_bucket_keys:
203
+ continue
204
+ bucket = self._aspect_ratio_buckets[closest_ratio]
205
+ bucket.append(idx)
206
+ # yield a batch of indices in the same aspect ratio group
207
+ if len(bucket) == self.batch_size:
208
+ yield bucket[:]
209
+ del bucket[:]
210
+
211
+ class AspectRatioBatchSampler(BatchSampler):
212
+ """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
213
+
214
+ Args:
215
+ sampler (Sampler): Base sampler.
216
+ dataset (Dataset): Dataset providing data information.
217
+ batch_size (int): Size of mini-batch.
218
+ drop_last (bool): If ``True``, the sampler will drop the last batch if
219
+ its size would be less than ``batch_size``.
220
+ aspect_ratios (dict): The predefined aspect ratios.
221
+ """
222
+ def __init__(
223
+ self,
224
+ sampler: Sampler,
225
+ dataset: Dataset,
226
+ batch_size: int,
227
+ video_folder: str = None,
228
+ train_data_format: str = "webvid",
229
+ aspect_ratios: dict = ASPECT_RATIO_512,
230
+ drop_last: bool = False,
231
+ config=None,
232
+ **kwargs
233
+ ) -> None:
234
+ if not isinstance(sampler, Sampler):
235
+ raise TypeError('sampler should be an instance of ``Sampler``, '
236
+ f'but got {sampler}')
237
+ if not isinstance(batch_size, int) or batch_size <= 0:
238
+ raise ValueError('batch_size should be a positive integer value, '
239
+ f'but got batch_size={batch_size}')
240
+ self.sampler = sampler
241
+ self.dataset = dataset
242
+ self.video_folder = video_folder
243
+ self.train_data_format = train_data_format
244
+ self.batch_size = batch_size
245
+ self.aspect_ratios = aspect_ratios
246
+ self.drop_last = drop_last
247
+ self.config = config
248
+ # buckets for each aspect ratio
249
+ self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
250
+ # [str(k) for k, v in aspect_ratios]
251
+ self.current_available_bucket_keys = list(aspect_ratios.keys())
252
+
253
+ def __iter__(self):
254
+ for idx in self.sampler:
255
+ try:
256
+ video_dict = self.dataset[idx]
257
+ width, more = video_dict.get("width", None), video_dict.get("height", None)
258
+
259
+ if width is None or height is None:
260
+ if self.train_data_format == "normal":
261
+ video_id, name = video_dict['file_path'], video_dict['text']
262
+ if self.video_folder is None:
263
+ video_dir = video_id
264
+ else:
265
+ video_dir = os.path.join(self.video_folder, video_id)
266
+ else:
267
+ videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
268
+ video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
269
+ cap = cv2.VideoCapture(video_dir)
270
+
271
+ # 获取视频尺寸
272
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # 浮点数转换为整数
273
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # 浮点数转换为整数
274
+
275
+ ratio = height / width # self.dataset[idx]
276
+ else:
277
+ height = int(height)
278
+ width = int(width)
279
+ ratio = height / width # self.dataset[idx]
280
+ except Exception as e:
281
+ print(e, self.dataset[idx], "This item is error, please check it.")
282
+ continue
283
+ # find the closest aspect ratio
284
+ closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
285
+ if closest_ratio not in self.current_available_bucket_keys:
286
+ continue
287
+ bucket = self._aspect_ratio_buckets[closest_ratio]
288
+ bucket.append(idx)
289
+ # yield a batch of indices in the same aspect ratio group
290
+ if len(bucket) == self.batch_size:
291
+ yield bucket[:]
292
+ del bucket[:]
293
+
294
+ class AspectRatioBatchImageVideoSampler(BatchSampler):
295
+ """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
296
+
297
+ Args:
298
+ sampler (Sampler): Base sampler.
299
+ dataset (Dataset): Dataset providing data information.
300
+ batch_size (int): Size of mini-batch.
301
+ drop_last (bool): If ``True``, the sampler will drop the last batch if
302
+ its size would be less than ``batch_size``.
303
+ aspect_ratios (dict): The predefined aspect ratios.
304
+ """
305
+
306
+ def __init__(self,
307
+ sampler: Sampler,
308
+ dataset: Dataset,
309
+ batch_size: int,
310
+ train_folder: str = None,
311
+ aspect_ratios: dict = ASPECT_RATIO_512,
312
+ drop_last: bool = False
313
+ ) -> None:
314
+ if not isinstance(sampler, Sampler):
315
+ raise TypeError('sampler should be an instance of ``Sampler``, '
316
+ f'but got {sampler}')
317
+ if not isinstance(batch_size, int) or batch_size <= 0:
318
+ raise ValueError('batch_size should be a positive integer value, '
319
+ f'but got batch_size={batch_size}')
320
+ self.sampler = sampler
321
+ self.dataset = dataset
322
+ self.train_folder = train_folder
323
+ self.batch_size = batch_size
324
+ self.aspect_ratios = aspect_ratios
325
+ self.drop_last = drop_last
326
+
327
+ # buckets for each aspect ratio
328
+ self.current_available_bucket_keys = list(aspect_ratios.keys())
329
+ self.bucket = {
330
+ 'image':{ratio: [] for ratio in aspect_ratios},
331
+ 'video':{ratio: [] for ratio in aspect_ratios}
332
+ }
333
+
334
+ def __iter__(self):
335
+ for idx in self.sampler:
336
+ content_type = self.dataset[idx].get('type', 'video') # Default to video for video edit datasets
337
+
338
+ try:
339
+ data_dict = self.dataset[idx]
340
+ width, height = data_dict.get("width", None), data_dict.get("height", None)
341
+
342
+ if width is None or height is None:
343
+ if content_type == 'image':
344
+ # Image branch
345
+ image_id = data_dict.get('file_path', '')
346
+ if self.train_folder is None:
347
+ image_dir = image_id
348
+ else:
349
+ image_dir = os.path.join(self.train_folder, image_id)
350
+ width, height = get_image_size_without_loading(image_dir)
351
+ else:
352
+ # Video branch - prefer original_video -> edited_video -> file_path
353
+ video_id = (
354
+ data_dict.get('original_video')
355
+ or data_dict.get('edited_video')
356
+ or data_dict.get('file_path')
357
+ )
358
+ if video_id is None:
359
+ raise ValueError(f"No valid video path found in dataset item: {data_dict}")
360
+ if self.train_folder is None:
361
+ video_dir = video_id
362
+ else:
363
+ video_dir = os.path.join(self.train_folder, video_id)
364
+ cap = cv2.VideoCapture(video_dir)
365
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
366
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
367
+ cap.release()
368
+ if width == 0 or height == 0:
369
+ raise ValueError(f"Invalid video size for {video_dir}: {width}x{height}")
370
+ else:
371
+ height = int(height)
372
+ width = int(width)
373
+
374
+ ratio = height / width
375
+
376
+ except Exception as e:
377
+ print(e, self.dataset[idx], "This item is error, please check it.")
378
+ continue
379
+
380
+ # Find the closest aspect ratio
381
+ closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
382
+ if closest_ratio not in self.current_available_bucket_keys:
383
+ continue
384
+
385
+ # Add to appropriate bucket (image or video)
386
+ bucket = self.bucket[content_type][closest_ratio]
387
+ bucket.append(idx)
388
+
389
+ # Yield a batch when bucket is full (ensures all items are same type)
390
+ if len(bucket) == self.batch_size:
391
+ yield bucket[:]
392
+ del bucket[:]
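How an item ends up in a bucket: get_closest_ratio maps the raw frame size to the nearest key in the ratio table, and the samplers above only batch indices that share that key. A quick sketch (the import path follows this repo's layout):

    from videox_fun.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio

    height, width = 720, 1280                  # e.g. a 16:9 source video
    size, ratio = get_closest_ratio(height, width, ASPECT_RATIO_512)
    print(size, ratio)                         # [384.0, 672.0] 0.57 -> the 384x672 (h x w) bucket

    # AspectRatioBatchSampler then appends this index to its '0.57' bucket and yields
    # a batch only once that bucket holds `batch_size` items of the same shape class.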
videox_fun/data/dataset_image.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torchvision.transforms as transforms
8
+ from PIL import Image
9
+ from torch.utils.data.dataset import Dataset
10
+
11
+
12
+ class CC15M(Dataset):
13
+ def __init__(
14
+ self,
15
+ json_path,
16
+ video_folder=None,
17
+ resolution=512,
18
+ enable_bucket=False,
19
+ ):
20
+ print(f"loading annotations from {json_path} ...")
21
+ self.dataset = json.load(open(json_path, 'r'))
22
+ self.length = len(self.dataset)
23
+ print(f"data scale: {self.length}")
24
+
25
+ self.enable_bucket = enable_bucket
26
+ self.video_folder = video_folder
27
+
28
+ resolution = tuple(resolution) if not isinstance(resolution, int) else (resolution, resolution)
29
+ self.pixel_transforms = transforms.Compose([
30
+ transforms.Resize(resolution[0]),
31
+ transforms.CenterCrop(resolution),
32
+ transforms.ToTensor(),
33
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
34
+ ])
35
+
36
+ def get_batch(self, idx):
37
+ video_dict = self.dataset[idx]
38
+ video_id, name = video_dict['file_path'], video_dict['text']
39
+
40
+ if self.video_folder is None:
41
+ video_dir = video_id
42
+ else:
43
+ video_dir = os.path.join(self.video_folder, video_id)
44
+
45
+ pixel_values = Image.open(video_dir).convert("RGB")
46
+ return pixel_values, name
47
+
48
+ def __len__(self):
49
+ return self.length
50
+
51
+ def __getitem__(self, idx):
52
+ while True:
53
+ try:
54
+ pixel_values, name = self.get_batch(idx)
55
+ break
56
+ except Exception as e:
57
+ print(e)
58
+ idx = random.randint(0, self.length-1)
59
+
60
+ if not self.enable_bucket:
61
+ pixel_values = self.pixel_transforms(pixel_values)
62
+ else:
63
+ pixel_values = np.array(pixel_values)
64
+
65
+ sample = dict(pixel_values=pixel_values, text=name)
66
+ return sample
67
+
68
+ if __name__ == "__main__":
69
+ dataset = CC15M(
70
+ csv_path="/mnt_wg/zhoumo.xjq/CCUtils/cc15m_add_index.json",
71
+ resolution=512,
72
+ )
73
+
74
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,)
75
+ for idx, batch in enumerate(dataloader):
76
+ print(batch["pixel_values"].shape, len(batch["text"]))
videox_fun/data/dataset_image_video.py ADDED
@@ -0,0 +1,1939 @@
1
+ import csv
2
+ import gc
3
+ import io
4
+ import json
5
+ import math
6
+ import os
7
+ import random
8
+ import re
9
+ from contextlib import contextmanager
10
+ from random import shuffle
11
+ from threading import Thread
12
+
13
+ import albumentations
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torchvision.transforms as transforms
19
+ from decord import VideoReader
20
+ from einops import rearrange
21
+ from func_timeout import FunctionTimedOut, func_timeout
22
+ from packaging import version as pver
23
+ from PIL import Image
24
+ from torch.utils.data import BatchSampler, Sampler
25
+ from torch.utils.data.dataset import Dataset
26
+
27
+ VIDEO_READER_TIMEOUT = 20
28
+
29
+ def get_random_mask(shape, image_start_only=False):
30
+ f, c, h, w = shape
31
+ mask = torch.zeros((f, 1, h, w), dtype=torch.uint8)
32
+
33
+ if not image_start_only:
34
+ if f != 1:
35
+ mask_index = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], p=[0.05, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05])
36
+ else:
37
+ mask_index = np.random.choice([0, 1], p = [0.2, 0.8])
38
+ if mask_index == 0:
39
+ center_x = torch.randint(0, w, (1,)).item()
40
+ center_y = torch.randint(0, h, (1,)).item()
41
+ block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # block width range
42
+ block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # block height range
43
+
44
+ start_x = max(center_x - block_size_x // 2, 0)
45
+ end_x = min(center_x + block_size_x // 2, w)
46
+ start_y = max(center_y - block_size_y // 2, 0)
47
+ end_y = min(center_y + block_size_y // 2, h)
48
+ mask[:, :, start_y:end_y, start_x:end_x] = 1
49
+ elif mask_index == 1:
50
+ mask[:, :, :, :] = 1
51
+ elif mask_index == 2:
52
+ mask_frame_index = np.random.randint(1, 5)
53
+ mask[mask_frame_index:, :, :, :] = 1
54
+ elif mask_index == 3:
55
+ mask_frame_index = np.random.randint(1, 5)
56
+ mask[mask_frame_index:-mask_frame_index, :, :, :] = 1
57
+ elif mask_index == 4:
58
+ center_x = torch.randint(0, w, (1,)).item()
59
+ center_y = torch.randint(0, h, (1,)).item()
60
+ block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # block width range
61
+ block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # block height range
62
+
63
+ start_x = max(center_x - block_size_x // 2, 0)
64
+ end_x = min(center_x + block_size_x // 2, w)
65
+ start_y = max(center_y - block_size_y // 2, 0)
66
+ end_y = min(center_y + block_size_y // 2, h)
67
+
68
+ mask_frame_before = np.random.randint(0, f // 2)
69
+ mask_frame_after = np.random.randint(f // 2, f)
70
+ mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1
71
+ elif mask_index == 5:
72
+ mask = torch.randint(0, 2, (f, 1, h, w), dtype=torch.uint8)
73
+ elif mask_index == 6:
74
+ num_frames_to_mask = random.randint(1, max(f // 2, 1))
75
+ frames_to_mask = random.sample(range(f), num_frames_to_mask)
76
+
77
+ for i in frames_to_mask:
78
+ block_height = random.randint(1, h // 4)
79
+ block_width = random.randint(1, w // 4)
80
+ top_left_y = random.randint(0, h - block_height)
81
+ top_left_x = random.randint(0, w - block_width)
82
+ mask[i, 0, top_left_y:top_left_y + block_height, top_left_x:top_left_x + block_width] = 1
83
+ elif mask_index == 7:
84
+ center_x = torch.randint(0, w, (1,)).item()
85
+ center_y = torch.randint(0, h, (1,)).item()
86
+ a = torch.randint(min(w, h) // 8, min(w, h) // 4, (1,)).item() # semi-major axis
87
+ b = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() # semi-minor axis
88
+
89
+ for i in range(h):
90
+ for j in range(w):
91
+ if ((i - center_y) ** 2) / (b ** 2) + ((j - center_x) ** 2) / (a ** 2) < 1:
92
+ mask[:, :, i, j] = 1
93
+ elif mask_index == 8:
94
+ center_x = torch.randint(0, w, (1,)).item()
95
+ center_y = torch.randint(0, h, (1,)).item()
96
+ radius = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item()
97
+ for i in range(h):
98
+ for j in range(w):
99
+ if (i - center_y) ** 2 + (j - center_x) ** 2 < radius ** 2:
100
+ mask[:, :, i, j] = 1
101
+ elif mask_index == 9:
102
+ for idx in range(f):
103
+ if np.random.rand() > 0.5:
104
+ mask[idx, :, :, :] = 1
105
+ else:
106
+ raise ValueError(f"The mask_index {mask_index} is not define")
107
+ else:
108
+ if f != 1:
109
+ mask[1:, :, :, :] = 1
110
+ else:
111
+ mask[:, :, :, :] = 1
112
+ return mask
113
+
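# A minimal usage sketch for get_random_mask defined above (shapes only; the
# (frames, channels, H, W) input is arbitrary and the returned mask has a single channel):
mask = get_random_mask((16, 3, 64, 64))
assert mask.shape == (16, 1, 64, 64)
print("masked pixel ratio:", mask.float().mean().item())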
114
+ class Camera(object):
115
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
116
+ """
117
+ def __init__(self, entry):
118
+ fx, fy, cx, cy = entry[1:5]
119
+ self.fx = fx
120
+ self.fy = fy
121
+ self.cx = cx
122
+ self.cy = cy
123
+ w2c_mat = np.array(entry[7:]).reshape(3, 4)
124
+ w2c_mat_4x4 = np.eye(4)
125
+ w2c_mat_4x4[:3, :] = w2c_mat
126
+ self.w2c_mat = w2c_mat_4x4
127
+ self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
128
+
129
+ def custom_meshgrid(*args):
130
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
131
+ """
132
+ # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
133
+ if pver.parse(torch.__version__) < pver.parse('1.10'):
134
+ return torch.meshgrid(*args)
135
+ else:
136
+ return torch.meshgrid(*args, indexing='ij')
137
+
138
+ def get_relative_pose(cam_params):
139
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
140
+ """
141
+ abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
142
+ abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
143
+ cam_to_origin = 0
144
+ target_cam_c2w = np.array([
145
+ [1, 0, 0, 0],
146
+ [0, 1, 0, -cam_to_origin],
147
+ [0, 0, 1, 0],
148
+ [0, 0, 0, 1]
149
+ ])
150
+ abs2rel = target_cam_c2w @ abs_w2cs[0]
151
+ ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
152
+ ret_poses = np.array(ret_poses, dtype=np.float32)
153
+ return ret_poses
154
+
155
+ def ray_condition(K, c2w, H, W, device):
156
+ """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
157
+ """
158
+ # c2w: B, V, 4, 4
159
+ # K: B, V, 4
160
+
161
+ B = K.shape[0]
162
+
163
+ j, i = custom_meshgrid(
164
+ torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
165
+ torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
166
+ )
167
+ i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
168
+ j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
169
+
170
+ fx, fy, cx, cy = K.chunk(4, dim=-1) # B,V, 1
171
+
172
+ zs = torch.ones_like(i) # [B, HxW]
173
+ xs = (i - cx) / fx * zs
174
+ ys = (j - cy) / fy * zs
175
+ zs = zs.expand_as(ys)
176
+
177
+ directions = torch.stack((xs, ys, zs), dim=-1) # B, V, HW, 3
178
+ directions = directions / directions.norm(dim=-1, keepdim=True) # B, V, HW, 3
179
+
180
+ rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2) # B, V, 3, HW
181
+ rays_o = c2w[..., :3, 3] # B, V, 3
182
+ rays_o = rays_o[:, :, None].expand_as(rays_d) # B, V, 3, HW
183
+ # c2w @ directions
184
+ rays_dxo = torch.cross(rays_o, rays_d)
185
+ plucker = torch.cat([rays_dxo, rays_d], dim=-1)
186
+ plucker = plucker.reshape(B, c2w.shape[1], H, W, 6) # B, V, H, W, 6
187
+ # plucker = plucker.permute(0, 1, 4, 2, 3)
188
+ return plucker
189
+
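# A small shape check for ray_condition above: K carries per-frame (fx, fy, cx, cy) and c2w the
# camera-to-world poses; the result is a 6-dim Pluecker ray embedding per pixel (dummy values only).
B, V, H, W = 1, 8, 32, 48
K = torch.tensor([100.0, 100.0, W / 2, H / 2]).expand(B, V, 4)
c2w = torch.eye(4).repeat(B, V, 1, 1)
plucker = ray_condition(K, c2w, H, W, device="cpu")
assert plucker.shape == (B, V, H, W, 6)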
190
+ def process_pose_file(pose_file_path, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu', return_poses=False):
191
+ """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
192
+ """
193
+ with open(pose_file_path, 'r') as f:
194
+ poses = f.readlines()
195
+
196
+ poses = [pose.strip().split(' ') for pose in poses[1:]]
197
+ cam_params = [[float(x) for x in pose] for pose in poses]
198
+ if return_poses:
199
+ return cam_params
200
+ else:
201
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
202
+
203
+ sample_wh_ratio = width / height
204
+ pose_wh_ratio = original_pose_width / original_pose_height # Assuming placeholder ratios, change as needed
205
+
206
+ if pose_wh_ratio > sample_wh_ratio:
207
+ resized_ori_w = height * pose_wh_ratio
208
+ for cam_param in cam_params:
209
+ cam_param.fx = resized_ori_w * cam_param.fx / width
210
+ else:
211
+ resized_ori_h = width / pose_wh_ratio
212
+ for cam_param in cam_params:
213
+ cam_param.fy = resized_ori_h * cam_param.fy / height
214
+
215
+ intrinsic = np.asarray([[cam_param.fx * width,
216
+ cam_param.fy * height,
217
+ cam_param.cx * width,
218
+ cam_param.cy * height]
219
+ for cam_param in cam_params], dtype=np.float32)
220
+
221
+ K = torch.as_tensor(intrinsic)[None] # [1, n_frame, 4]
222
+ c2ws = get_relative_pose(cam_params) # relative camera-to-world poses (get_relative_pose is defined above)
223
+ c2ws = torch.as_tensor(c2ws)[None] # [1, n_frame, 4, 4]
224
+ plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous() # V, 6, H, W
225
+ plucker_embedding = plucker_embedding[None]
226
+ plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
227
+ return plucker_embedding
228
+
229
+ def process_pose_params(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu'):
230
+ """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
231
+ """
232
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
233
+
234
+ sample_wh_ratio = width / height
235
+ pose_wh_ratio = original_pose_width / original_pose_height # Assuming placeholder ratios, change as needed
236
+
237
+ if pose_wh_ratio > sample_wh_ratio:
238
+ resized_ori_w = height * pose_wh_ratio
239
+ for cam_param in cam_params:
240
+ cam_param.fx = resized_ori_w * cam_param.fx / width
241
+ else:
242
+ resized_ori_h = width / pose_wh_ratio
243
+ for cam_param in cam_params:
244
+ cam_param.fy = resized_ori_h * cam_param.fy / height
245
+
246
+ intrinsic = np.asarray([[cam_param.fx * width,
247
+ cam_param.fy * height,
248
+ cam_param.cx * width,
249
+ cam_param.cy * height]
250
+ for cam_param in cam_params], dtype=np.float32)
251
+
252
+ K = torch.as_tensor(intrinsic)[None] # [1, n_frame, 4]
253
+ c2ws = get_relative_pose(cam_params) # relative camera-to-world poses (get_relative_pose is defined above)
254
+ c2ws = torch.as_tensor(c2ws)[None] # [1, n_frame, 4, 4]
255
+ plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous() # V, 6, H, W
256
+ plucker_embedding = plucker_embedding[None]
257
+ plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
258
+ return plucker_embedding
259
+
260
+ def derive_ground_object_from_instruction(instruction: str) -> str:
261
+ s = (instruction or '').strip()
262
+ if not s:
263
+ return 'the target area'
264
+ s = s.rstrip('.').strip()
265
+
266
+ # swap/replace: capture phrase between "replace/swap" and "with/by"
267
+ swap_patterns = [
268
+ r"\breplace\s+(.*?)\s+(?:with|by)\b",
269
+ r"\bswap\s+(.*?)\s+with\b",
270
+ ]
271
+ for pat in swap_patterns:
272
+ m = re.search(pat, s, flags=re.IGNORECASE)
273
+ if m:
274
+ phrase = m.group(1).strip(' .,:;')
275
+ if phrase:
276
+ return phrase
277
+
278
+ # removal: capture object after remove/delete/erase/eliminate up to a preposition or punctuation
279
+ m = re.search(r"\b(?:remove|delete|erase|eliminate)\s+(.*?)(?:\s+(?:from|in|at|on|over|under|near|by)\b|[.,;]|$)", s, flags=re.IGNORECASE)
280
+ if m:
281
+ phrase = m.group(1).strip(' .,:;')
282
+ if phrase:
283
+ return phrase
284
+
285
+ # add/insert: generic target area
286
+ if re.search(r"^\s*(?:add|insert)\b", s, flags=re.IGNORECASE):
287
+ return 'the target area'
288
+
289
+ # local style (change/make ...): take the immediate noun after determiner
290
+ m = re.search(r"\b(?:change|make)\s+(?:(the|a|an)\s+)?([A-Za-z][A-Za-z0-9\-]*)", s, flags=re.IGNORECASE)
291
+ if m:
292
+ det = m.group(1) or ''
293
+ noun = m.group(2)
294
+ phrase = (det + ' ' + noun).strip()
295
+ return phrase
296
+
297
+ return 'the target area'
298
+
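# Example behaviour of derive_ground_object_from_instruction above (each output follows
# directly from the regex rules; the instruction strings are illustrative, not from the dataset):
print(derive_ground_object_from_instruction("replace the dog with a cat"))          # the dog
print(derive_ground_object_from_instruction("remove the red car from the street"))  # the red car
print(derive_ground_object_from_instruction("add a rainbow over the hills"))        # the target area
print(derive_ground_object_from_instruction("make the sky purple"))                 # the sky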
299
+ class ImageVideoSampler(BatchSampler):
300
+ """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
301
+
302
+ Args:
303
+ sampler (Sampler): Base sampler.
304
+ dataset (Dataset): Dataset providing data information.
305
+ batch_size (int): Size of mini-batch.
306
+ drop_last (bool): If ``True``, the sampler will drop the last batch if
307
+ its size would be less than ``batch_size``.
308
309
+ """
310
+
311
+ def __init__(self,
312
+ sampler: Sampler,
313
+ dataset: Dataset,
314
+ batch_size: int,
315
+ drop_last: bool = False
316
+ ) -> None:
317
+ if not isinstance(sampler, Sampler):
318
+ raise TypeError('sampler should be an instance of ``Sampler``, '
319
+ f'but got {sampler}')
320
+ if not isinstance(batch_size, int) or batch_size <= 0:
321
+ raise ValueError('batch_size should be a positive integer value, '
322
+ f'but got batch_size={batch_size}')
323
+ self.sampler = sampler
324
+ self.dataset = dataset
325
+ self.batch_size = batch_size
326
+ self.drop_last = drop_last
327
+
328
+ # buckets for each aspect ratio
329
+ self.bucket = {'image':[], 'video':[]}
330
+
331
+ def __iter__(self):
332
+ for idx in self.sampler:
333
+ content_type = self.dataset.dataset[idx].get('type', 'image')
334
+ self.bucket[content_type].append(idx)
335
+
336
+ # yield a batch of indices in the same aspect ratio group
337
+ if len(self.bucket['video']) == self.batch_size:
338
+ bucket = self.bucket['video']
339
+ yield bucket[:]
340
+ del bucket[:]
341
+ elif len(self.bucket['image']) == self.batch_size:
342
+ bucket = self.bucket['image']
343
+ yield bucket[:]
344
+ del bucket[:]
345
+
346
+ @contextmanager
347
+ def VideoReader_contextmanager(*args, **kwargs):
348
+ vr = VideoReader(*args, **kwargs)
349
+ try:
350
+ yield vr
351
+ finally:
352
+ del vr
353
+ gc.collect()
354
+
355
+ def get_video_reader_batch(video_reader, batch_index):
356
+ frames = video_reader.get_batch(batch_index).asnumpy()
357
+ return frames
358
+
359
+ def resize_frame(frame, target_short_side):
360
+ h, w, _ = frame.shape
361
+ if h < w:
362
+ if target_short_side > h:
363
+ return frame
364
+ new_h = target_short_side
365
+ new_w = int(target_short_side * w / h)
366
+ else:
367
+ if target_short_side > w:
368
+ return frame
369
+ new_w = target_short_side
370
+ new_h = int(target_short_side * h / w)
371
+
372
+ resized_frame = cv2.resize(frame, (new_w, new_h))
373
+ return resized_frame
374
+
375
+ class VideoEditDataset(Dataset):
376
+ def __init__(
377
+ self,
378
+ ann_path,
379
+ data_root=None,
380
+ video_sample_height: int = None, # None to support dynamic resolutions
381
+ video_sample_width: int = None,
382
+ video_sample_stride=1,
383
+ video_sample_n_frames=65, # 9+8=17 for your case
384
+ source_frames=33,
385
+ edit_frames=32,
386
+ text_drop_ratio=0.1,
387
+ enable_bucket=False,
388
+ enable_inpaint=False,
389
+ instruction_template="A video sequence showing two parts: the first half shows the original scene, and the second half shows the same scene but {edit_instruction}",
390
+ ):
391
+ dataset = json.load(open(ann_path))
392
+ if isinstance(dataset, dict):
393
+ new_dataset = []
394
+ for vid_id, info in dataset.items():
395
+ text_content = info["edit_instruction"]
396
+ new_dataset.append({
397
+ "original_video": info["original_video"],
398
+ "edited_video": info["edited_video"],
399
+ "text": text_content,
400
+ "type": info.get("type", "video"),
401
+ # add resolution info to the metadata
402
+ "resolution": info.get("resolution", None)
403
+ })
404
+ dataset = new_dataset
405
+
406
+ self.data_root = data_root
407
+ self.dataset = dataset
408
+ self.length = len(self.dataset)
409
+
410
+ self.source_frames = source_frames
411
+ self.edit_frames = edit_frames
412
+ self.video_sample_n_frames = video_sample_n_frames
413
+
414
+ self.instruction_template = instruction_template
415
+ self.enable_bucket = enable_bucket
416
+ self.text_drop_ratio = text_drop_ratio
417
+ self.enable_inpaint = enable_inpaint
418
+ self.video_sample_stride = video_sample_stride
419
+
420
+ # if bucket training is enabled, do not fix the resolution
421
+ if enable_bucket:
422
+ self.video_sample_height = None
423
+ self.video_sample_width = None
424
+ else:
425
+ self.video_sample_height = video_sample_height
426
+ self.video_sample_width = video_sample_width
427
+
428
+ def load_video_pair(self, original_path, edited_path):
429
+ """加载视频对,保持原始分辨率用于bucket training"""
430
+ if self.data_root is not None:
431
+ original_path = os.path.join(self.data_root, original_path)
432
+ edited_path = os.path.join(self.data_root, edited_path)
433
+
434
+ with VideoReader_contextmanager(original_path, num_threads=2) as orig_reader, \
435
+ VideoReader_contextmanager(edited_path, num_threads=2) as edit_reader:
436
+
437
+ # get video info
438
+ orig_length = len(orig_reader)
439
+ edit_length = len(edit_reader)
440
+ min_length = min(orig_length, edit_length)
441
+
442
+ # unified sampling strategy
443
+ start_idx = 0 # start from the beginning
444
+
445
+ orig_indices = np.linspace(
446
+ start_idx,
447
+ min(start_idx + (self.source_frames - 1) * self.video_sample_stride, orig_length - 1),
448
+ self.source_frames,
449
+ dtype=int
450
+ )
451
+
452
+ edit_indices = np.linspace(
453
+ start_idx,
454
+ min(start_idx + (self.edit_frames - 1) * self.video_sample_stride, edit_length - 1),
455
+ self.edit_frames,
456
+ dtype=int
457
+ )
458
+
459
+ # load frames
460
+ orig_frames = get_video_reader_batch(orig_reader, orig_indices)
461
+ edit_frames = get_video_reader_batch(edit_reader, edit_indices)
462
+
463
+ # before concatenation, align both clips to the same HxW (resize, then center-crop to min(H1,H2) x min(W1,W2))
464
+ def resize_and_center_crop_batch(frames_np, target_h, target_w):
465
+ resized = []
466
+ for i in range(frames_np.shape[0]):
467
+ frame = frames_np[i]
468
+ h, w = frame.shape[0], frame.shape[1]
469
+ scale = max(target_h / h, target_w / w)
470
+ new_h = int(round(h * scale))
471
+ new_w = int(round(w * scale))
472
+ frame_resized = cv2.resize(frame, (new_w, new_h))
473
+ y0 = max((new_h - target_h) // 2, 0)
474
+ x0 = max((new_w - target_w) // 2, 0)
475
+ frame_cropped = frame_resized[y0:y0 + target_h, x0:x0 + target_w]
476
+ resized.append(frame_cropped)
477
+ return np.stack(resized, axis=0)
478
+
479
+ oh, ow = orig_frames.shape[1], orig_frames.shape[2]
480
+ eh, ew = edit_frames.shape[1], edit_frames.shape[2]
481
+ target_h = min(oh, eh)
482
+ target_w = min(ow, ew)
483
+ if (oh != target_h or ow != target_w):
484
+ orig_frames = resize_and_center_crop_batch(orig_frames, target_h, target_w)
485
+ if (eh != target_h or ew != target_w):
486
+ edit_frames = resize_and_center_crop_batch(edit_frames, target_h, target_w)
487
+
488
+ # if bucket training is enabled, return numpy arrays
489
+ if self.enable_bucket:
490
+ return np.concatenate([orig_frames, edit_frames], axis=0)
491
+ else:
492
+ # convert to tensors and normalize
493
+ orig_frames = torch.from_numpy(orig_frames).permute(0, 3, 1, 2).contiguous() / 255.
494
+ edit_frames = torch.from_numpy(edit_frames).permute(0, 3, 1, 2).contiguous() / 255.
495
+ return torch.cat([orig_frames, edit_frames], dim=0)
496
+
497
+ def __len__(self):
498
+ return self.length
499
+
500
+ def __getitem__(self, idx):
501
+ data_info = self.dataset[idx % len(self.dataset)]
502
+
503
+ while True:
504
+ try:
505
+ # load the video pair
506
+ pixel_values = self.load_video_pair(
507
+ data_info['original_video'],
508
+ data_info['edited_video']
509
+ )
510
+
511
+ # prepare the text
512
+ text = data_info['text']
513
+ if self.instruction_template and "{edit_instruction}" in self.instruction_template:
514
+ text = self.instruction_template.format(edit_instruction=text)
515
+
516
+ if random.random() < self.text_drop_ratio:
517
+ text = ''
518
+
519
+ sample = {
520
+ "pixel_values": pixel_values,
521
+ "text": text,
522
+ "data_type": "video",
523
+ "idx": idx,
524
+ }
525
+
526
+ # if inpainting is needed
527
+ if self.enable_inpaint and not self.enable_bucket:
528
+ # add inpaint logic here
529
+ pass
530
+
531
+ return sample
532
+
533
+ except Exception as e:
534
+ try:
535
+ print(
536
+ f"Error loading video pair: {e}\n"
537
+ f" original={os.path.join(self.data_root, data_info.get('original_video','')) if self.data_root else data_info.get('original_video','')}\n"
538
+ f" edited ={os.path.join(self.data_root, data_info.get('edited_video','')) if self.data_root else data_info.get('edited_video','')}"
539
+ )
540
+ except Exception:
541
+ print(f"Error loading video pair: {e}")
542
+ idx = random.randint(0, self.length-1)
+ # refresh data_info so the retry uses the newly sampled item instead of the failing pair
+ data_info = self.dataset[idx % len(self.dataset)]
543
+
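# A minimal usage sketch for VideoEditDataset (the annotation path, data root and batching
# choices below are hypothetical; the JSON maps ids to original_video / edited_video / edit_instruction):
dataset = VideoEditDataset(
    ann_path="annotations/edit_pairs.json",
    data_root="videos/",
    source_frames=33,
    edit_frames=32,
    enable_bucket=True,   # keep native resolution and return numpy frames
)
loader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers=0)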
544
+ class VideoEditReasoningDataset(Dataset):
545
+ def __init__(
546
+ self,
547
+ ann_path,
548
+ data_root=None,
549
+ video_sample_height: int = None,
550
+ video_sample_width: int = None,
551
+ video_sample_stride=1,
552
+ video_sample_n_frames=65,
553
+ source_frames=33,
554
+ reasoning_frames=4,
555
+ edit_frames=32,
556
+ text_drop_ratio=0.1,
557
+ enable_bucket=False,
558
+ enable_inpaint=False,
559
+ instruction_template="A video sequence showing three parts: first the original scene, then grounded {ground_instrction}, and finally the same scene but {edit_instruction}",
560
+ ):
561
+ dataset = json.load(open(ann_path))
562
+ if isinstance(dataset, dict):
563
+ new_dataset = []
564
+ for vid_id, info in dataset.items():
565
+ text_content = info.get("edit_instruction", info.get("text", ""))
566
+ # support both 'grounded_video' and 'ground_video'
567
+ grounded_key = "grounded_video" if "grounded_video" in info else "ground_video"
568
+ new_dataset.append({
569
+ "original_video": info["original_video"],
570
+ "grounded_video": info[grounded_key],
571
+ "edited_video": info["edited_video"],
572
+ "text": text_content,
573
+ "edit_instruction": text_content,
574
+ "type": info.get("type", "video"),
575
+ "resolution": info.get("resolution", None),
576
+ })
577
+ dataset = new_dataset
578
+
579
+ self.data_root = data_root
580
+ self.dataset = dataset
581
+ self.length = len(self.dataset)
582
+
583
+ self.source_frames = source_frames
584
+ self.reasoning_frames = reasoning_frames
585
+ self.edit_frames = edit_frames
586
+ self.video_sample_n_frames = video_sample_n_frames
587
+
588
+ self.instruction_template = instruction_template
589
+ self.enable_bucket = enable_bucket
590
+ self.text_drop_ratio = text_drop_ratio
591
+ self.enable_inpaint = enable_inpaint
592
+ self.video_sample_stride = video_sample_stride
593
+
594
+ if enable_bucket:
595
+ self.video_sample_height = None
596
+ self.video_sample_width = None
597
+ else:
598
+ self.video_sample_height = video_sample_height
599
+ self.video_sample_width = video_sample_width
600
+
601
+ def load_video_pair(self, original_path, grounded_path, edited_path):
602
+ if self.data_root is not None:
603
+ original_path = os.path.join(self.data_root, original_path)
604
+ grounded_path = os.path.join(self.data_root, grounded_path)
605
+ edited_path = os.path.join(self.data_root, edited_path)
606
+
607
+ with VideoReader_contextmanager(original_path, num_threads=2) as orig_reader, \
608
+ VideoReader_contextmanager(grounded_path, num_threads=2) as ground_reader, \
609
+ VideoReader_contextmanager(edited_path, num_threads=2) as edit_reader:
610
+
611
+ orig_length = len(orig_reader)
612
+ ground_length = len(ground_reader)
613
+ edit_length = len(edit_reader)
614
+
615
+ start_idx = 0
616
+
617
+ orig_indices = np.linspace(
618
+ start_idx,
619
+ min(start_idx + (self.source_frames - 1) * self.video_sample_stride, max(orig_length - 1, 0)),
620
+ self.source_frames,
621
+ dtype=int
622
+ )
623
+
624
+ # reasoning/grounded indices sampled at an 8-frame interval (e.g. 0, 8, 16, 24, ...)
625
+ interval = 8
626
+ ground_indices_full = np.arange(0, max(ground_length, 1), interval, dtype=int)
627
+ if len(ground_indices_full) == 0:
628
+ ground_indices = np.array([0] * self.reasoning_frames, dtype=int)
629
+ else:
630
+ ground_indices = ground_indices_full[: self.reasoning_frames]
631
+ if len(ground_indices) < self.reasoning_frames:
632
+ pad_value = ground_indices[-1] if len(ground_indices) > 0 else 0
633
+ ground_indices = np.pad(
634
+ ground_indices, (0, self.reasoning_frames - len(ground_indices)), constant_values=pad_value
635
+ )
636
+
637
+ edit_indices = np.linspace(
638
+ start_idx,
639
+ min(start_idx + (self.edit_frames - 1) * self.video_sample_stride, max(edit_length - 1, 0)),
640
+ self.edit_frames,
641
+ dtype=int
642
+ )
643
+
644
+ orig_frames = get_video_reader_batch(orig_reader, orig_indices)
645
+ ground_frames = get_video_reader_batch(ground_reader, ground_indices)
646
+ edit_frames = get_video_reader_batch(edit_reader, edit_indices)
647
+
648
+ def resize_and_center_crop_batch(frames_np, target_h, target_w):
649
+ resized = []
650
+ for i in range(frames_np.shape[0]):
651
+ frame = frames_np[i]
652
+ h, w = frame.shape[0], frame.shape[1]
653
+ scale = max(target_h / h, target_w / w)
654
+ new_h = int(round(h * scale))
655
+ new_w = int(round(w * scale))
656
+ frame_resized = cv2.resize(frame, (new_w, new_h))
657
+ y0 = max((new_h - target_h) // 2, 0)
658
+ x0 = max((new_w - target_w) // 2, 0)
659
+ frame_cropped = frame_resized[y0:y0 + target_h, x0:x0 + target_w]
660
+ resized.append(frame_cropped)
661
+ return np.stack(resized, axis=0)
662
+
663
+ oh, ow = orig_frames.shape[1], orig_frames.shape[2]
664
+ gh, gw = ground_frames.shape[1], ground_frames.shape[2]
665
+ eh, ew = edit_frames.shape[1], edit_frames.shape[2]
666
+ target_h = min(oh, gh, eh)
667
+ target_w = min(ow, gw, ew)
668
+ if (oh != target_h or ow != target_w):
669
+ orig_frames = resize_and_center_crop_batch(orig_frames, target_h, target_w)
670
+ if (gh != target_h or gw != target_w):
671
+ ground_frames = resize_and_center_crop_batch(ground_frames, target_h, target_w)
672
+ if (eh != target_h or ew != target_w):
673
+ edit_frames = resize_and_center_crop_batch(edit_frames, target_h, target_w)
674
+
675
+ if self.enable_bucket:
676
+ return np.concatenate([orig_frames, ground_frames, edit_frames], axis=0)
677
+ else:
678
+ orig_frames = torch.from_numpy(orig_frames).permute(0, 3, 1, 2).contiguous() / 255.
679
+ ground_frames = torch.from_numpy(ground_frames).permute(0, 3, 1, 2).contiguous() / 255.
680
+ edit_frames = torch.from_numpy(edit_frames).permute(0, 3, 1, 2).contiguous() / 255.
681
+ return torch.cat([orig_frames, ground_frames, edit_frames], dim=0)
682
+
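# A standalone sketch of the grounded-frame index selection above: take every `interval`-th frame,
# keep the first `reasoning_frames` indices, and pad with the last index when the clip is short.
ground_length, reasoning_frames, interval = 20, 4, 8
idxs = np.arange(0, max(ground_length, 1), interval, dtype=int)[:reasoning_frames]   # [0 8 16]
if len(idxs) < reasoning_frames:
    idxs = np.pad(idxs, (0, reasoning_frames - len(idxs)), constant_values=idxs[-1])
print(idxs)   # [ 0  8 16 16]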
683
+ def __len__(self):
684
+ return self.length
685
+
686
+ def __getitem__(self, idx):
687
+ data_info = self.dataset[idx % len(self.dataset)]
688
+
689
+ while True:
690
+ try:
691
+ pixel_values = self.load_video_pair(
692
+ data_info['original_video'],
693
+ data_info.get('grounded_video', data_info.get('ground_video')),
694
+ data_info['edited_video'],
695
+ )
696
+
697
+ # Prepare instructions
698
+ edit_text = data_info.get('edit_instruction', data_info.get('text', ''))
699
+ ground_instr = derive_ground_object_from_instruction(edit_text)
700
+
701
+ text = edit_text
702
+ if self.instruction_template:
703
+ text = self.instruction_template.format(edit_instruction=edit_text, ground_instruction=ground_instr)
704
+
705
+ if random.random() < self.text_drop_ratio:
706
+ text = ''
707
+
708
+ sample = {
709
+ "pixel_values": pixel_values,
710
+ "text": text,
711
+ "data_type": "video",
712
+ "idx": idx,
713
+ }
714
+
715
+ if self.enable_inpaint and not self.enable_bucket:
716
+ pass
717
+
718
+ return sample
719
+
720
+ except Exception as e:
721
+ print(f"Error loading video triplet: {e}")
722
+ idx = random.randint(0, self.length-1)
+ # refresh data_info so the retry uses the newly sampled item instead of the failing triplet
+ data_info = self.dataset[idx % len(self.dataset)]
723
+
724
+ class ImageVideoDataset(Dataset):
725
+ def __init__(
726
+ self,
727
+ ann_path, data_root=None,
728
+ video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
729
+ image_sample_size=512,
730
+ video_repeat=0,
731
+ text_drop_ratio=0.1,
732
+ enable_bucket=False,
733
+ video_length_drop_start=0.0,
734
+ video_length_drop_end=1.0,
735
+ enable_inpaint=False,
736
+ return_file_name=False,
737
+ ):
738
+ # Loading annotations from files
739
+ print(f"loading annotations from {ann_path} ...")
740
+ if ann_path.endswith('.csv'):
741
+ with open(ann_path, 'r') as csvfile:
742
+ dataset = list(csv.DictReader(csvfile))
743
+ elif ann_path.endswith('.json'):
744
+ dataset = json.load(open(ann_path))
745
+
746
+ self.data_root = data_root
747
+
748
+ # It's used to balance num of images and videos.
749
+ if video_repeat > 0:
750
+ self.dataset = []
751
+ for data in dataset:
752
+ if data.get('type', 'image') != 'video':
753
+ self.dataset.append(data)
754
+
755
+ for _ in range(video_repeat):
756
+ for data in dataset:
757
+ if data.get('type', 'image') == 'video':
758
+ self.dataset.append(data)
759
+ else:
760
+ self.dataset = dataset
761
+ del dataset
762
+
763
+ self.length = len(self.dataset)
764
+ print(f"data scale: {self.length}")
765
+ # TODO: enable bucket training
766
+ self.enable_bucket = enable_bucket
767
+ self.text_drop_ratio = text_drop_ratio
768
+ self.enable_inpaint = enable_inpaint
769
+ self.return_file_name = return_file_name
770
+
771
+ self.video_length_drop_start = video_length_drop_start
772
+ self.video_length_drop_end = video_length_drop_end
773
+
774
+ # Video params
775
+ self.video_sample_stride = video_sample_stride
776
+ self.video_sample_n_frames = video_sample_n_frames
777
+ self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
778
+ self.video_transforms = transforms.Compose(
779
+ [
780
+ transforms.Resize(min(self.video_sample_size)),
781
+ transforms.CenterCrop(self.video_sample_size),
782
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
783
+ ]
784
+ )
785
+
786
+ # Image params
787
+ self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size)
788
+ self.image_transforms = transforms.Compose([
789
+ transforms.Resize(min(self.image_sample_size)),
790
+ transforms.CenterCrop(self.image_sample_size),
791
+ transforms.ToTensor(),
792
+ transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
793
+ ])
794
+
795
+ self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size))
796
+
797
+ def get_batch(self, idx):
798
+ data_info = self.dataset[idx % len(self.dataset)]
799
+
800
+ if data_info.get('type', 'image')=='video':
801
+ video_id, text = data_info['file_path'], data_info['text']
802
+
803
+ if self.data_root is None:
804
+ video_dir = video_id
805
+ else:
806
+ video_dir = os.path.join(self.data_root, video_id)
807
+
808
+ with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
809
+ min_sample_n_frames = min(
810
+ self.video_sample_n_frames,
811
+ int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
812
+ )
813
+ if min_sample_n_frames == 0:
814
+ raise ValueError(f"No Frames in video.")
815
+
816
+ video_length = int(self.video_length_drop_end * len(video_reader))
817
+ clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
818
+ start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
819
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
820
+
821
+ try:
822
+ sample_args = (video_reader, batch_index)
823
+ pixel_values = func_timeout(
824
+ VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
825
+ )
826
+ resized_frames = []
827
+ for i in range(len(pixel_values)):
828
+ frame = pixel_values[i]
829
+ resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
830
+ resized_frames.append(resized_frame)
831
+ pixel_values = np.array(resized_frames)
832
+ except FunctionTimedOut:
833
+ raise ValueError(f"Read {idx} timeout.")
834
+ except Exception as e:
835
+ raise ValueError(f"Failed to extract frames from video. Error is {e}.")
836
+
837
+ if not self.enable_bucket:
838
+ pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
839
+ pixel_values = pixel_values / 255.
840
+ del video_reader
841
+ else:
842
+ pixel_values = pixel_values
843
+
844
+ if not self.enable_bucket:
845
+ pixel_values = self.video_transforms(pixel_values)
846
+
847
+ # Random use no text generation
848
+ if random.random() < self.text_drop_ratio:
849
+ text = ''
850
+ return pixel_values, text, 'video', video_dir
851
+ else:
852
+ image_path, text = data_info['file_path'], data_info['text']
853
+ if self.data_root is not None:
854
+ image_path = os.path.join(self.data_root, image_path)
855
+ image = Image.open(image_path).convert('RGB')
856
+ if not self.enable_bucket:
857
+ image = self.image_transforms(image).unsqueeze(0)
858
+ else:
859
+ image = np.expand_dims(np.array(image), 0)
860
+ if random.random() < self.text_drop_ratio:
861
+ text = ''
862
+ return image, text, 'image', image_path
863
+
864
+ def __len__(self):
865
+ return self.length
866
+
867
+ def __getitem__(self, idx):
868
+ data_info = self.dataset[idx % len(self.dataset)]
869
+ data_type = data_info.get('type', 'image')
870
+ while True:
871
+ sample = {}
872
+ try:
873
+ data_info_local = self.dataset[idx % len(self.dataset)]
874
+ data_type_local = data_info_local.get('type', 'image')
875
+ if data_type_local != data_type:
876
+ raise ValueError("data_type_local != data_type")
877
+
878
+ pixel_values, name, data_type, file_path = self.get_batch(idx)
879
+ sample["pixel_values"] = pixel_values
880
+ sample["text"] = name
881
+ sample["data_type"] = data_type
882
+ sample["idx"] = idx
883
+ if self.return_file_name:
884
+ sample["file_name"] = os.path.basename(file_path)
885
+
886
+ if len(sample) > 0:
887
+ break
888
+ except Exception as e:
889
+ print(e, self.dataset[idx % len(self.dataset)])
890
+ idx = random.randint(0, self.length-1)
+
+ return sample
891
+
892
+ class ImageVideoEditDataset(Dataset):
893
+ def __init__(
894
+ self,
895
+ ann_path,
896
+ data_root=None,
897
+ video_sample_size=512,
898
+ video_sample_stride=1,
899
+ source_frames=33,
900
+ target_frames=32,
901
+ text_drop_ratio=0.1,
902
+ enable_bucket=False,
903
+ enable_inpaint=False,
904
+ video_length_drop_start=0.0,
905
+ video_length_drop_end=1.0,
906
+ instruction_template="A video sequence showing two parts: the first half shows the original scene, and the second half shows the same scene but {edit_instruction}",
907
+ ):
908
+ dataset = json.load(open(ann_path))
909
+ if isinstance(dataset, dict):
910
+ new_dataset = []
911
+ for _, info in dataset.items():
912
+ # Keep original keys, just standardize text field
913
+ data_type = info.get("type", "video")
914
+ entry = dict(info) # Copy original entry
915
+ # Standardize text field name and handle None/empty values
916
+ if "edit_instruction" in entry:
917
+ entry["text"] = entry["edit_instruction"]
918
+ elif "instruction" in entry:
919
+ entry["text"] = entry["instruction"]
920
+ elif "text" not in entry:
921
+ entry["text"] = ""
922
+
923
+ # Ensure text is not None (convert None to empty string)
924
+ if entry["text"] is None:
925
+ entry["text"] = ""
926
+
927
+ # Add file_path for bucket sampler compatibility
928
+ # Bucket sampler expects 'file_path' to get dimensions
929
+ if data_type == "video":
930
+ entry["file_path"] = entry.get("original_video", "")
931
+ else: # image
932
+ entry["file_path"] = entry.get("original_image", "")
933
+
934
+ new_dataset.append(entry)
935
+ dataset = new_dataset
936
+
937
+ self.data_root = data_root
938
+ self.dataset = dataset
939
+ self.length = len(self.dataset)
940
+
941
+ # sampling params
942
+ self.video_sample_stride = video_sample_stride
943
+ self.source_frames = source_frames
944
+ self.target_frames = target_frames
945
+ self.video_length_drop_start = video_length_drop_start
946
+ self.video_length_drop_end = video_length_drop_end
947
+
948
+ # transforms params (match ImageVideoDataset)
949
+ self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
950
+ self.video_transforms = transforms.Compose(
951
+ [
952
+ transforms.Resize(min(self.video_sample_size)),
953
+ transforms.CenterCrop(self.video_sample_size),
954
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
955
+ ]
956
+ )
957
+
958
+ # Image transforms for non-bucket mode
959
+ self.image_transforms = transforms.Compose([
960
+ transforms.Resize(min(self.video_sample_size)),
961
+ transforms.CenterCrop(self.video_sample_size),
962
+ transforms.ToTensor(),
963
+ transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
964
+ ])
965
+
966
+ self.instruction_template = instruction_template
967
+ self.enable_bucket = enable_bucket
968
+ self.text_drop_ratio = text_drop_ratio
969
+ self.enable_inpaint = enable_inpaint
970
+
971
+ # For pre-resize like ImageVideoDataset
972
+ self.larger_side_of_image_and_video = min(self.video_sample_size)
973
+
974
+ def _resize_and_center_crop_batch(self, frames_np, target_h, target_w):
975
+ resized = []
976
+ for i in range(frames_np.shape[0]):
977
+ frame = frames_np[i]
978
+ h, w = frame.shape[0], frame.shape[1]
979
+ scale = max(target_h / h, target_w / w)
980
+ new_h = int(round(h * scale))
981
+ new_w = int(round(w * scale))
982
+ frame_resized = cv2.resize(frame, (new_w, new_h))
983
+ y0 = max((new_h - target_h) // 2, 0)
984
+ x0 = max((new_w - target_w) // 2, 0)
985
+ frame_cropped = frame_resized[y0:y0 + target_h, x0:x0 + target_w]
986
+ resized.append(frame_cropped)
987
+ return np.stack(resized, axis=0)
988
+
989
+ def _resize_and_center_crop_image(self, image_np, target_h, target_w):
990
+ h, w = image_np.shape[0], image_np.shape[1]
991
+ scale = max(target_h / h, target_w / w)
992
+ new_h = int(round(h * scale))
993
+ new_w = int(round(w * scale))
994
+ image_resized = cv2.resize(image_np, (new_w, new_h))
995
+ y0 = max((new_h - target_h) // 2, 0)
996
+ x0 = max((new_w - target_w) // 2, 0)
997
+ image_cropped = image_resized[y0:y0 + target_h, x0:x0 + target_w]
998
+ return image_cropped
999
+
1000
+ def get_batch(self, idx):
1001
+ data_info = self.dataset[idx % len(self.dataset)]
1002
+
1003
+ data_type = data_info.get('type', 'video')
1004
+
1005
+ # Handle None or empty instruction with safety fallback
1006
+ raw_text = data_info.get('text', '')
1007
+ if raw_text is None or (isinstance(raw_text, str) and not raw_text.strip()):
1008
+ # Use a generic fallback description if instruction is missing
1009
+ raw_text = "the content has been modified"
1010
+
1011
+ # Apply instruction template if available
1012
+ if self.instruction_template and "{edit_instruction}" in self.instruction_template:
1013
+ text = self.instruction_template.format(edit_instruction=raw_text)
1014
+ else:
1015
+ text = raw_text
1016
+
1017
+ if data_type == 'video':
1018
+ # video pair branch (default)
1019
+ src_rel, tgt_rel = data_info['original_video'], data_info['edited_video']
1020
+
1021
+ if self.data_root is not None:
1022
+ src_path = os.path.join(self.data_root, src_rel)
1023
+ tgt_path = os.path.join(self.data_root, tgt_rel)
1024
+ else:
1025
+ src_path = src_rel
1026
+ tgt_path = tgt_rel
1027
+
1028
+ # Force use CPU decoder to read all frames instead of just keyframes
1029
+ from decord import cpu
1030
+ with VideoReader_contextmanager(src_path, num_threads=2, ctx=cpu(0)) as src_reader, \
1031
+ VideoReader_contextmanager(tgt_path, num_threads=2, ctx=cpu(0)) as tgt_reader:
1032
+
1033
+ # Get video lengths
1034
+ src_length = len(src_reader)
1035
+ tgt_length = len(tgt_reader)
1036
+
1037
+ # Check if video has enough frames
1038
+ if src_length < self.source_frames:
1039
+ raise ValueError(f"Source video only has {src_length} frames, but requested {self.source_frames}")
1040
+ if tgt_length < self.target_frames:
1041
+ raise ValueError(f"Target video only has {tgt_length} frames, but requested {self.target_frames}")
1042
+
1043
+ # Unified sampling strategy: start from beginning (same as VideoEditDataset)
1044
+ start_idx = 0
1045
+
1046
+ src_indices = np.linspace(
1047
+ start_idx,
1048
+ min(start_idx + (self.source_frames - 1) * self.video_sample_stride, src_length - 1),
1049
+ self.source_frames,
1050
+ dtype=int
1051
+ )
1052
+
1053
+ tgt_indices = np.linspace(
1054
+ start_idx,
1055
+ min(start_idx + (self.target_frames - 1) * self.video_sample_stride, tgt_length - 1),
1056
+ self.target_frames,
1057
+ dtype=int
1058
+ )
1059
+
1060
+ # read batches with timeout
1061
+ try:
1062
+ src_frames = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(src_reader, src_indices))
1063
+ tgt_frames = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(tgt_reader, tgt_indices))
1064
+ except FunctionTimedOut:
1065
+ raise ValueError(f"Read {idx} timeout.")
1066
+ except Exception as e:
1067
+ raise ValueError(f"Failed to extract frames from pair. Error is {e}.")
1068
+
1069
+ # align HxW between source and target to enable concat
1070
+ sh, sw = src_frames.shape[1], src_frames.shape[2]
1071
+ th, tw = tgt_frames.shape[1], tgt_frames.shape[2]
1072
+ target_h = min(sh, th)
1073
+ target_w = min(sw, tw)
1074
+ if (sh != target_h or sw != target_w):
1075
+ src_frames = self._resize_and_center_crop_batch(src_frames, target_h, target_w)
1076
+ if (th != target_h or tw != target_w):
1077
+ tgt_frames = self._resize_and_center_crop_batch(tgt_frames, target_h, target_w)
1078
+
1079
+ if not self.enable_bucket:
1080
+ src_tensor = torch.from_numpy(src_frames).permute(0, 3, 1, 2).contiguous() / 255.
1081
+ tgt_tensor = torch.from_numpy(tgt_frames).permute(0, 3, 1, 2).contiguous() / 255.
1082
+
1083
+ src_tensor = self.video_transforms(src_tensor)
1084
+ tgt_tensor = self.video_transforms(tgt_tensor)
1085
+ else:
1086
+ src_tensor = src_frames
1087
+ tgt_tensor = tgt_frames
1088
+
1089
+ # Random text drop
1090
+ if random.random() < self.text_drop_ratio:
1091
+ text = ''
1092
+
1093
+ return src_tensor, tgt_tensor, text, 'video'
1094
+ else:
1095
+ # image pair branch (simple like ImageVideoDataset image path)
1096
+ src_img_rel = data_info.get('original_image')
1097
+ tgt_img_rel = data_info.get('edited_image')
1098
+ if src_img_rel is None or tgt_img_rel is None:
1099
+ raise ValueError('Missing original_image/edited_image for image sample')
1100
+
1101
+ if self.data_root is not None:
1102
+ src_img_path = os.path.join(self.data_root, src_img_rel)
1103
+ tgt_img_path = os.path.join(self.data_root, tgt_img_rel)
1104
+ else:
1105
+ src_img_path = src_img_rel
1106
+ tgt_img_path = tgt_img_rel
1107
+
1108
+ src_img = Image.open(src_img_path).convert('RGB')
1109
+ tgt_img = Image.open(tgt_img_path).convert('RGB')
1110
+
1111
+ if not self.enable_bucket:
1112
+ # Apply transforms and add frame dimension
1113
+ src_tensor = self.image_transforms(src_img).unsqueeze(0) # (1, C, H, W)
1114
+ tgt_tensor = self.image_transforms(tgt_img).unsqueeze(0) # (1, C, H, W)
1115
+ else:
1116
+ # For bucket mode, keep as numpy and add frame dimension
1117
+ src_tensor = np.expand_dims(np.array(src_img), axis=0) # (1, H, W, C)
1118
+ tgt_tensor = np.expand_dims(np.array(tgt_img), axis=0) # (1, H, W, C)
1119
+
1120
+ if random.random() < self.text_drop_ratio:
1121
+ text = ''
1122
+
1123
+ return src_tensor, tgt_tensor, text, 'image'
1124
+
1125
+ def __len__(self):
1126
+ return self.length
1127
+
1128
+ def __getitem__(self, idx):
1129
+ data_info = self.dataset[idx % len(self.dataset)]
1130
+ data_type = data_info.get('type', 'video')
1131
+ while True:
1132
+ sample = {}
1133
+ try:
1134
+ data_info_local = self.dataset[idx % len(self.dataset)]
1135
+ data_type_local = data_info_local.get('type', 'video')
1136
+ if data_type_local != data_type:
1137
+ raise ValueError("data_type_local != data_type")
1138
+
1139
+ src_vals, tgt_vals, name, data_type = self.get_batch(idx)
1140
+ if data_type == 'video':
1141
+ sample["pixel_values_src_video"] = src_vals
1142
+ sample["pixel_values_tgt_video"] = tgt_vals
1143
+ else:
1144
+ sample["pixel_values_src_image"] = src_vals
1145
+ sample["pixel_values_tgt_image"] = tgt_vals
1146
+ sample["text"] = name
1147
+ sample["data_type"] = data_type
1148
+ sample["idx"] = idx
1149
+
1150
+ if len(sample) > 0:
1151
+ break
1152
+ except Exception as e:
1153
+ print(e, self.dataset[idx % len(self.dataset)])
1154
+ idx = random.randint(0, self.length-1)
1155
+
1156
+ # Inpaint not applied here to avoid ambiguity across src/tgt branches
1157
+
1158
+ return sample
1159
+
1160
+
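# A hypothetical annotation entry consumed by ImageVideoEditDataset (a dict keyed by id); the
# field names mirror the keys read in __init__/get_batch above, the paths and text are made up:
example_annotation = {
    "clip_0001": {
        "type": "video",
        "original_video": "clips/clip_0001_src.mp4",
        "edited_video": "clips/clip_0001_edit.mp4",
        "edit_instruction": "remove the red car from the street",
    }
}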
1161
+ class ImageVideoCoTDataset(Dataset):
1162
+ """
1163
+ Dataset for Chain-of-Thought (CoT) style image/video editing.
1164
+ - For videos: loads original_video, grounded_video, and edited_video (3-part)
1165
+ - For images: loads original_image and edited_image (2-part, same as ImageVideoEditDataset)
1166
+ """
1167
+ def __init__(
1168
+ self,
1169
+ ann_path,
1170
+ data_root=None,
1171
+ video_sample_size=512,
1172
+ video_sample_stride=1,
1173
+ source_frames=33,
1174
+ reasoning_frames=4,
1175
+ target_frames=33,
1176
+ text_drop_ratio=0.1,
1177
+ enable_bucket=False,
1178
+ enable_inpaint=False,
1179
+ video_length_drop_start=0.0,
1180
+ video_length_drop_end=1.0,
1181
+ instruction_template="A video sequence showing three parts: first the original scene, then grounded {ground_instruction}, and finally the same scene but {edit_instruction}",
1182
+ enable_gradual_ground=False,
1183
+ enable_gray_red_mask=False,
1184
+ enable_gray_black_background=False,
1185
+ enable_gray_alpha_overlay=False,
1186
+ gray_alpha=0.5,
1187
+ gray_intensity_range=(96, 160),
1188
+ gray_tolerance=12,
1189
+ ):
1190
+ dataset = json.load(open(ann_path))
1191
+ if isinstance(dataset, dict):
1192
+ new_dataset = []
1193
+ for _, info in dataset.items():
1194
+ data_type = info.get("type", "video")
1195
+ entry = dict(info) # Copy original entry
1196
+
1197
+ # Standardize text field name and handle None/empty values
1198
+ if "edit_instruction" in entry:
1199
+ entry["text"] = entry["edit_instruction"]
1200
+ elif "instruction" in entry:
1201
+ entry["text"] = entry["instruction"]
1202
+ elif "text" not in entry:
1203
+ entry["text"] = ""
1204
+
1205
+ # Ensure text is not None
1206
+ if entry["text"] is None:
1207
+ entry["text"] = ""
1208
+
1209
+ # Add file_path for bucket sampler compatibility
1210
+ if data_type == "video":
1211
+ entry["file_path"] = entry.get("original_video", "")
1212
+ else: # image
1213
+ entry["file_path"] = entry.get("original_image", "")
1214
+
1215
+ new_dataset.append(entry)
1216
+ dataset = new_dataset
1217
+
1218
+ self.data_root = data_root
1219
+ self.dataset = dataset
1220
+ self.length = len(self.dataset)
1221
+
1222
+ # sampling params
1223
+ self.video_sample_stride = video_sample_stride
1224
+ self.source_frames = source_frames
1225
+ self.reasoning_frames = reasoning_frames
1226
+ self.target_frames = target_frames
1227
+ self.video_length_drop_start = video_length_drop_start
1228
+ self.video_length_drop_end = video_length_drop_end
1229
+
1230
+ # transforms params
1231
+ self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
1232
+ self.video_transforms = transforms.Compose(
1233
+ [
1234
+ transforms.Resize(min(self.video_sample_size)),
1235
+ transforms.CenterCrop(self.video_sample_size),
1236
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
1237
+ ]
1238
+ )
1239
+
1240
+ # Image transforms for non-bucket mode
1241
+ self.image_transforms = transforms.Compose([
1242
+ transforms.Resize(min(self.video_sample_size)),
1243
+ transforms.CenterCrop(self.video_sample_size),
1244
+ transforms.ToTensor(),
1245
+ transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
1246
+ ])
1247
+
1248
+ self.instruction_template = instruction_template
1249
+ self.enable_bucket = enable_bucket
1250
+ self.text_drop_ratio = text_drop_ratio
1251
+ self.enable_inpaint = enable_inpaint
1252
+ self.enable_gradual_ground = enable_gradual_ground
1253
+ # only one visualization mode at a time
1254
+ enabled_modes = int(bool(enable_gray_red_mask)) + int(bool(enable_gray_black_background)) + int(bool(enable_gray_alpha_overlay))
1255
+ if enabled_modes > 1:
1256
+ raise ValueError("enable_gray_red_mask, enable_gray_black_background and enable_gray_alpha_overlay cannot be enabled simultaneously.")
1257
+ self.enable_gray_red_mask = enable_gray_red_mask
1258
+ self.enable_gray_black_background = enable_gray_black_background
1259
+ self.enable_gray_alpha_overlay = enable_gray_alpha_overlay
1260
+ self.gray_alpha = float(gray_alpha)
1261
+ if not (0.0 <= self.gray_alpha <= 1.0):
1262
+ raise ValueError("gray_alpha must be in [0,1].")
1263
+ if not isinstance(gray_intensity_range, (list, tuple)) or len(gray_intensity_range) != 2:
1264
+ raise ValueError("gray_intensity_range must contain exactly two values (min and max intensity).")
1265
+ self.gray_intensity_range = (int(gray_intensity_range[0]), int(gray_intensity_range[1]))
1266
+ if self.gray_intensity_range[0] > self.gray_intensity_range[1]:
1267
+ raise ValueError("gray_intensity_range min value cannot be greater than max value.")
1268
+ self.gray_tolerance = int(gray_tolerance)
1269
+
1270
+ # For pre-resize like ImageVideoDataset
1271
+ self.larger_side_of_image_and_video = min(self.video_sample_size)
1272
+
1273
+ def _resize_and_center_crop_batch(self, frames_np, target_h, target_w):
1274
+ resized = []
1275
+ for i in range(frames_np.shape[0]):
1276
+ frame = frames_np[i]
1277
+ h, w = frame.shape[0], frame.shape[1]
1278
+ scale = max(target_h / h, target_w / w)
1279
+ new_h = int(round(h * scale))
1280
+ new_w = int(round(w * scale))
1281
+ frame_resized = cv2.resize(frame, (new_w, new_h))
1282
+ y0 = max((new_h - target_h) // 2, 0)
1283
+ x0 = max((new_w - target_w) // 2, 0)
1284
+ frame_cropped = frame_resized[y0:y0 + target_h, x0:x0 + target_w]
1285
+ resized.append(frame_cropped)
1286
+ return np.stack(resized, axis=0)
1287
+
1288
+ def _resize_and_center_crop_image(self, image_np, target_h, target_w):
1289
+ h, w = image_np.shape[0], image_np.shape[1]
1290
+ scale = max(target_h / h, target_w / w)
1291
+ new_h = int(round(h * scale))
1292
+ new_w = int(round(w * scale))
1293
+ image_resized = cv2.resize(image_np, (new_w, new_h))
1294
+ y0 = max((new_h - target_h) // 2, 0)
1295
+ x0 = max((new_w - target_w) // 2, 0)
1296
+ image_cropped = image_resized[y0:y0 + target_h, x0:x0 + target_w]
1297
+ return image_cropped
1298
+
1299
+ def _derive_ground_instruction(self, edit_instruction_text: str) -> str:
1300
+ """Derive grounded object phrase from instruction using shared rules."""
1301
+ return derive_ground_object_from_instruction(edit_instruction_text)
1302
+
1303
+ def _ensure_same_size_pair(self, img_a: np.ndarray, img_b: np.ndarray) -> tuple:
1304
+ """Resize img_b to img_a's size if needed to enable per-pixel interpolation."""
1305
+ ha, wa = img_a.shape[:2]
1306
+ hb, wb = img_b.shape[:2]
1307
+ if (ha, wa) == (hb, wb):
1308
+ return img_a, img_b
1309
+ resized_b = cv2.resize(img_b, (wa, ha), interpolation=cv2.INTER_LINEAR)
1310
+ return img_a, resized_b
1311
+
1312
+ def _interpolate_ground_frames(self, ground_first: np.ndarray, target_first: np.ndarray,
1313
+ total_steps: int = 16,
1314
+ pick_indices: tuple = (0, 4, 8, 12)) -> np.ndarray:
1315
+ """
1316
+ Create grounding frames by linearly interpolating between the first frame of
1317
+ the grounding video and the first frame of the edited video, then picking
1318
+ specific indices.
1319
+ Returns array of shape (len(pick_indices), H, W, 3) in uint8.
1320
+ """
1321
+ a_np, b_np = self._ensure_same_size_pair(ground_first, target_first)
1322
+
1323
+ a_t = torch.from_numpy(a_np).float() / 255.0 # H, W, C
1324
+ b_t = torch.from_numpy(b_np).float() / 255.0 # H, W, C
1325
+
1326
+ a_t = a_t.permute(2, 0, 1).contiguous() # C, H, W
1327
+ b_t = b_t.permute(2, 0, 1).contiguous() # C, H, W
1328
+
1329
+ c, h, w = a_t.shape
1330
+ pair = torch.stack([a_t, b_t], dim=0) # 2, C, H, W
1331
+ pair_chw_t = pair.permute(1, 2, 3, 0).contiguous() # C, H, W, 2
1332
+ seq = pair_chw_t.view(1, c * h * w, 2) # 1, (C*H*W), 2
1333
+ with torch.no_grad():
1334
+ seq_interp = F.interpolate(seq, size=int(total_steps), mode="linear", align_corners=True)
1335
+ seq_interp = seq_interp.view(c, h, w, int(total_steps)).permute(3, 0, 1, 2).contiguous() # T, C, H, W
1336
+
1337
+ out_frames = []
1338
+ t_steps = int(total_steps)
1339
+ for idx in pick_indices:
1340
+ safe_idx = max(0, min(int(idx), t_steps - 1))
1341
+ img = (seq_interp[safe_idx].clamp(0.0, 1.0) * 255.0).byte().permute(1, 2, 0).cpu().numpy()
1342
+ out_frames.append(img)
1343
+ return np.stack(out_frames, axis=0)
1344
+
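# The interpolation above is plain linear blending; a standalone sketch of the blend weights it
# produces (align_corners=True maps output index k to weight k / (total_steps - 1)):
a = torch.zeros(3, 8, 8)                      # stand-in for the grounded first frame
b = torch.ones(3, 8, 8)                       # stand-in for the edited first frame
pair = torch.stack([a, b]).permute(1, 2, 3, 0).reshape(1, -1, 2)
interp = F.interpolate(pair, size=16, mode="linear", align_corners=True)
print(interp[0, 0, [0, 4, 8, 12]])            # tensor([0.0000, 0.2667, 0.5333, 0.8000])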
1345
+ def _build_gray_mask(self, frame: np.ndarray) -> np.ndarray:
1346
+ """Detect gray regions in a frame using intensity range and tolerance."""
1347
+ frame_float = frame.astype(np.float32)
1348
+ if frame_float.max() <= 1.0:
1349
+ frame_float = frame_float * 255.0
1350
+ channel_max = frame_float.max(axis=2)
1351
+ channel_min = frame_float.min(axis=2)
1352
+ min_intensity, max_intensity = self.gray_intensity_range
1353
+ tone_flatness = channel_max - channel_min
1354
+ mask = tone_flatness <= float(self.gray_tolerance)
1355
+ mask &= channel_max >= float(min_intensity)
1356
+ mask &= channel_max <= float(max_intensity)
1357
+ return mask
1358
+
1359
+ def _apply_gray_region_effect(self, frames_np: np.ndarray, mode: str) -> np.ndarray:
1360
+ """Apply requested effect on detected gray regions for a batch of frames."""
1361
+ processed_frames = []
1362
+ for frame in frames_np:
1363
+ mask = self._build_gray_mask(frame)
1364
+ if not np.any(mask):
1365
+ processed_frames.append(frame)
1366
+ continue
1367
+ frame_out = frame.copy()
1368
+ if np.issubdtype(frame_out.dtype, np.floating) and frame_out.max() <= 1.0:
1369
+ red_value = np.array([1.0, 0.0, 0.0], dtype=frame_out.dtype)
1370
+ else:
1371
+ red_value = np.array([255, 0, 0], dtype=frame_out.dtype)
1372
+ if mode == "red":
1373
+ frame_out[mask] = red_value
1374
+ else:
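+ # "black" mode: blank the whole frame, then restore only the detected gray regions.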
1375
+ frame_out[:] = 0
1376
+ frame_out[mask] = frame[mask]
1377
+ processed_frames.append(frame_out)
1378
+ return np.stack(processed_frames, axis=0)
1379
+
1380
+ def _apply_gray_overlay_from_reference(self, src_frames_np: np.ndarray, ref_frames_np: np.ndarray,
1381
+ alpha: float = 0.5, gray_value: float = 0.5, num_frames: int = 4) -> np.ndarray:
1382
+ """
1383
+ Detect gray regions on ref frames, and overlay gray with alpha onto the
1384
+ first `num_frames` frames of src frames at the same positions.
1385
+ """
1386
+ n = min(int(num_frames), int(src_frames_np.shape[0]), int(ref_frames_np.shape[0]))
1387
+ if n <= 0:
1388
+ return src_frames_np
1389
+ out = src_frames_np.copy()
1390
+ a = float(alpha)
1391
+ a = 0.0 if a < 0.0 else (1.0 if a > 1.0 else a)
1392
+ gv = float(gray_value)
1393
+ gv = 0.0 if gv < 0.0 else (1.0 if gv > 1.0 else gv)
1394
+ for i in range(n):
1395
+ mask = self._build_gray_mask(ref_frames_np[i])
1396
+ if not np.any(mask):
1397
+ continue
1398
+ src = out[i]
1399
+ # normalize to 0..1 float
1400
+ if np.issubdtype(src.dtype, np.floating):
1401
+ f = src.astype(np.float32)
1402
+ if f.max() > 1.0:
1403
+ f = np.clip(f / 255.0, 0.0, 1.0)
1404
+ back_to_uint8 = False
1405
+ else:
1406
+ f = src.astype(np.float32) / 255.0
1407
+ back_to_uint8 = True
1408
+ gray_color = np.array([gv, gv, gv], dtype=np.float32)
1409
+ # boolean mask is (H,W); f[mask] -> (K,3), broadcast with gray_color (3,)
1410
+ f[mask] = (1.0 - a) * f[mask] + a * gray_color
1411
+ if back_to_uint8:
1412
+ out[i] = (f * 255.0).clip(0, 255).astype(src.dtype)
1413
+ else:
1414
+ out[i] = f.astype(src.dtype)
1415
+ return out
1416
+
1417
+ def get_batch(self, idx):
1418
+ data_info = self.dataset[idx % len(self.dataset)]
1419
+ data_type = data_info.get('type', 'video')
1420
+
1421
+ # Handle None or empty instruction with safety fallback
1422
+ raw_text = data_info.get('text', '')
1423
+ if raw_text is None or (isinstance(raw_text, str) and not raw_text.strip()):
1424
+ raw_text = "the content has been modified"
1425
+
1426
+ if data_type == 'video':
1427
+ # Video triplet branch: original + grounded + edited
1428
+ src_rel = data_info['original_video']
1429
+ # Support both 'grounded_video' and 'ground_video' keys
1430
+ ground_rel = data_info.get('grounded_video', data_info.get('ground_video'))
1431
+ tgt_rel = data_info['edited_video']
1432
+
1433
+ if self.data_root is not None:
1434
+ src_path = os.path.join(self.data_root, src_rel)
1435
+ ground_path = os.path.join(self.data_root, ground_rel)
1436
+ tgt_path = os.path.join(self.data_root, tgt_rel)
1437
+ else:
1438
+ src_path = src_rel
1439
+ ground_path = ground_rel
1440
+ tgt_path = tgt_rel
1441
+
1442
+ # Force the CPU decoder so that all frames can be read
1443
+ from decord import cpu
1444
+ with VideoReader_contextmanager(src_path, num_threads=2, ctx=cpu(0)) as src_reader, \
1445
+ VideoReader_contextmanager(ground_path, num_threads=2, ctx=cpu(0)) as ground_reader, \
1446
+ VideoReader_contextmanager(tgt_path, num_threads=2, ctx=cpu(0)) as tgt_reader:
1447
+
1448
+ # Get video lengths
1449
+ src_length = len(src_reader)
1450
+ ground_length = len(ground_reader)
1451
+ tgt_length = len(tgt_reader)
1452
+
1453
+ # Check if video has enough frames
1454
+ if src_length < self.source_frames:
1455
+ raise ValueError(f"Source video only has {src_length} frames, but requested {self.source_frames}")
1456
+ if tgt_length < self.target_frames:
1457
+ raise ValueError(f"Target video only has {tgt_length} frames, but requested {self.target_frames}")
1458
+
1459
+ # Unified sampling strategy: start from beginning
1460
+ start_idx = 0
1461
+
1462
+ # Sample source frames
1463
+ src_indices = np.linspace(
1464
+ start_idx,
1465
+ min(start_idx + (self.source_frames - 1) * self.video_sample_stride, src_length - 1),
1466
+ self.source_frames,
1467
+ dtype=int
1468
+ )
1469
+
1470
+ # Sample target frames
1471
+ tgt_indices = np.linspace(
1472
+ start_idx,
1473
+ min(start_idx + (self.target_frames - 1) * self.video_sample_stride, tgt_length - 1),
1474
+ self.target_frames,
1475
+ dtype=int
1476
+ )
1477
+
1478
+ # Read batches with timeout
1479
+ try:
1480
+ src_frames = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(src_reader, src_indices))
1481
+ tgt_frames = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(tgt_reader, tgt_indices))
1482
+
1483
+ if self.enable_gradual_ground:
1484
+ # Interpolate between first frame of grounded and edited videos
1485
+ ground_first = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(ground_reader, [0]))
1486
+ # Use the first decoded edited frame if available to avoid double decode
1487
+ tgt_first_frame = tgt_frames[0]
1488
+ # steps: 0..15, pick 0,3,6,9,12 -> 5 grounding frames
1489
+ ground_frames = self._interpolate_ground_frames(
1490
+ ground_first=ground_first[0],
1491
+ target_first=tgt_first_frame,
1492
+ total_steps=16,
1493
+ pick_indices=(0, 3, 6, 9, 12),
1494
+ )
1495
+ else:
1496
+ # # Original behavior: sample grounding frames evenly by stride
1497
+ # ground_indices = np.linspace(
1498
+ # start_idx,
1499
+ # min(start_idx + (self.reasoning_frames - 1) * self.video_sample_stride, ground_length - 1),
1500
+ # self.reasoning_frames,
1501
+ # dtype=int
1502
+ # )
1503
+
1504
+ #==============================================================
1505
+ # New behavior: ground_indices are the first 'reasoning_frames' from src_indices
1506
+ ground_indices = src_indices[:self.reasoning_frames]
1507
+
1508
+ # --- Important safety check ---
1509
+ # Make sure the last frame we want to sample (ground_indices[-1])
1510
+ # does not exceed the total length of the ground video (ground_length)
1511
+ if len(ground_indices) > 0 and ground_indices[-1] >= ground_length:
1512
+ raise ValueError(
1513
+ f"Data inconsistency error: Ground video has only {ground_length} frames, "
1514
+ f"but the source-based sampling (stride={self.video_sample_stride}) "
1515
+ f"requires reading up to frame {ground_indices[-1]}. "
1516
+ f"File: {ground_path}"
1517
+ )
1518
+ ground_frames = func_timeout(VIDEO_READER_TIMEOUT, get_video_reader_batch, args=(ground_reader, ground_indices))
1519
+ except FunctionTimedOut:
1520
+ raise ValueError(f"Read {idx} timeout.")
1521
+ except Exception as e:
1522
+ raise ValueError(f"Failed to extract frames from triplet. Error is {e}.")
1523
+
1524
+ # Align HxW among source, ground, and target to enable concat
1525
+ sh, sw = src_frames.shape[1], src_frames.shape[2]
1526
+ gh, gw = ground_frames.shape[1], ground_frames.shape[2]
1527
+ th, tw = tgt_frames.shape[1], tgt_frames.shape[2]
1528
+ target_h = min(sh, gh, th)
1529
+ target_w = min(sw, gw, tw)
1530
+
1531
+ if (sh != target_h or sw != target_w):
1532
+ src_frames = self._resize_and_center_crop_batch(src_frames, target_h, target_w)
1533
+ if (gh != target_h or gw != target_w):
1534
+ ground_frames = self._resize_and_center_crop_batch(ground_frames, target_h, target_w)
1535
+ if (th != target_h or tw != target_w):
1536
+ tgt_frames = self._resize_and_center_crop_batch(tgt_frames, target_h, target_w)
1537
+
1538
+ if self.enable_gray_red_mask or self.enable_gray_black_background:
1539
+ effect_mode = "red" if self.enable_gray_red_mask else "black"
1540
+ ground_frames = self._apply_gray_region_effect(ground_frames, effect_mode)
1541
+ elif self.enable_gray_alpha_overlay:
1542
+ # Use gray regions detected on grounding frames to overlay 50% gray on the
1543
+ # first 4 frames of the original video.
1544
+ ground_frames = self._apply_gray_overlay_from_reference(
1545
+ src_frames, ground_frames, alpha=self.gray_alpha, gray_value=0.5, num_frames=4
1546
+ )
1547
+
1548
+ if not self.enable_bucket:
1549
+ src_tensor = torch.from_numpy(src_frames).permute(0, 3, 1, 2).contiguous() / 255.
1550
+ ground_tensor = torch.from_numpy(ground_frames).permute(0, 3, 1, 2).contiguous() / 255.
1551
+ tgt_tensor = torch.from_numpy(tgt_frames).permute(0, 3, 1, 2).contiguous() / 255.
1552
+
1553
+ src_tensor = self.video_transforms(src_tensor)
1554
+ ground_tensor = self.video_transforms(ground_tensor)
1555
+ tgt_tensor = self.video_transforms(tgt_tensor)
1556
+ else:
1557
+ src_tensor = src_frames
1558
+ ground_tensor = ground_frames
1559
+ tgt_tensor = tgt_frames
1560
+ # Prepare text with template
1561
+ ground_instr = self._derive_ground_instruction(raw_text)
1562
+ if self.instruction_template and "{edit_instruction}" in self.instruction_template:
1563
+ text = self.instruction_template.format(
1564
+ edit_instruction=raw_text,
1565
+ ground_instruction=ground_instr
1566
+ )
1567
+ else:
1568
+ text = raw_text
1569
+
1570
+ # Random text drop
1571
+ if random.random() < self.text_drop_ratio:
1572
+ text = ''
1573
+
1574
+ return src_tensor, ground_tensor, tgt_tensor, text, 'video'
1575
+
1576
+ else:
1577
+ # Image pair branch (simple like ImageVideoEditDataset)
1578
+ src_img_rel = data_info.get('original_image')
1579
+ tgt_img_rel = data_info.get('edited_image')
1580
+ if src_img_rel is None or tgt_img_rel is None:
1581
+ raise ValueError('Missing original_image/edited_image for image sample')
1582
+
1583
+ if self.data_root is not None:
1584
+ src_img_path = os.path.join(self.data_root, src_img_rel)
1585
+ tgt_img_path = os.path.join(self.data_root, tgt_img_rel)
1586
+ else:
1587
+ src_img_path = src_img_rel
1588
+ tgt_img_path = tgt_img_rel
1589
+
1590
+ src_img = Image.open(src_img_path).convert('RGB')
1591
+ tgt_img = Image.open(tgt_img_path).convert('RGB')
1592
+
1593
+ if not self.enable_bucket:
1594
+ # Apply transforms and add frame dimension
1595
+ src_tensor = self.image_transforms(src_img).unsqueeze(0) # (1, C, H, W)
1596
+ tgt_tensor = self.image_transforms(tgt_img).unsqueeze(0) # (1, C, H, W)
1597
+ else:
1598
+ # For bucket mode, keep as numpy and add frame dimension
1599
+ src_tensor = np.expand_dims(np.array(src_img), axis=0) # (1, H, W, C)
1600
+ tgt_tensor = np.expand_dims(np.array(tgt_img), axis=0) # (1, H, W, C)
1601
+
1602
+ # Apply instruction template if available
1603
+ if self.instruction_template and "{edit_instruction}" in self.instruction_template:
1604
+ text = self.instruction_template.format(edit_instruction=raw_text, ground_instruction="")
1605
+ else:
1606
+ text = raw_text
1607
+
1608
+ if random.random() < self.text_drop_ratio:
1609
+ text = ''
1610
+
1611
+ # For images, ground_tensor is None
1612
+ return src_tensor, None, tgt_tensor, text, 'image'
1613
+
1614
+ def __len__(self):
1615
+ return self.length
1616
+
1617
+ def __getitem__(self, idx):
1618
+ data_info = self.dataset[idx % len(self.dataset)]
1619
+ data_type = data_info.get('type', 'video')
1620
+ while True:
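+ # Retry with a random index on failure so a single corrupted sample does not stall training.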
1621
+ sample = {}
1622
+ try:
1623
+ data_info_local = self.dataset[idx % len(self.dataset)]
1624
+ data_type_local = data_info_local.get('type', 'video')
1625
+ if data_type_local != data_type:
1626
+ raise ValueError("data_type_local != data_type")
1627
+
1628
+ result = self.get_batch(idx)
1629
+
1630
+ if data_type == 'video':
1631
+ src_vals, ground_vals, tgt_vals, name, data_type = result
1632
+ sample["pixel_values_src_video"] = src_vals
1633
+ sample["pixel_values_ground_video"] = ground_vals
1634
+ sample["pixel_values_tgt_video"] = tgt_vals
1635
+ else:
1636
+ src_vals, _, tgt_vals, name, data_type = result
1637
+ sample["pixel_values_src_image"] = src_vals
1638
+ sample["pixel_values_tgt_image"] = tgt_vals
1639
+
1640
+ sample["text"] = name
1641
+ sample["data_type"] = data_type
1642
+ sample["idx"] = idx
1643
+
1644
+ if len(sample) > 0:
1645
+ break
1646
+ except Exception as e:
1647
+ print(e, self.dataset[idx % len(self.dataset)])
1648
+ idx = random.randint(0, self.length-1)
1649
+
1650
+ return sample
1651
+
1652
+ def padding_image(images, new_width, new_height):
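+ # Letterbox the input image onto a white canvas of (new_width, new_height),
+ # preserving aspect ratio and centering the resized image.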
1653
+ new_image = Image.new('RGB', (new_width, new_height), (255, 255, 255))
1654
+
1655
+ aspect_ratio = images.width / images.height
1656
+ if new_width / new_height > 1:
1657
+ if aspect_ratio > new_width / new_height:
1658
+ new_img_width = new_width
1659
+ new_img_height = int(new_img_width / aspect_ratio)
1660
+ else:
1661
+ new_img_height = new_height
1662
+ new_img_width = int(new_img_height * aspect_ratio)
1663
+ else:
1664
+ if aspect_ratio > new_width / new_height:
1665
+ new_img_width = new_width
1666
+ new_img_height = int(new_img_width / aspect_ratio)
1667
+ else:
1668
+ new_img_height = new_height
1669
+ new_img_width = int(new_img_height * aspect_ratio)
1670
+
1671
+ resized_img = images.resize((new_img_width, new_img_height))
1672
+
1673
+ paste_x = (new_width - new_img_width) // 2
1674
+ paste_y = (new_height - new_img_height) // 2
1675
+
1676
+ new_image.paste(resized_img, (paste_x, paste_y))
1677
+
1678
+ return new_image
1679
+
1680
+ class ImageVideoControlDataset(Dataset):
1681
+ def __init__(
1682
+ self,
1683
+ ann_path, data_root=None,
1684
+ video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
1685
+ image_sample_size=512,
1686
+ video_repeat=0,
1687
+ text_drop_ratio=0.1,
1688
+ enable_bucket=False,
1689
+ video_length_drop_start=0.1,
1690
+ video_length_drop_end=0.9,
1691
+ enable_inpaint=False,
1692
+ enable_camera_info=False,
1693
+ ):
1694
+ # Loading annotations from files
1695
+ if ann_path.endswith('.csv'):
1696
+ with open(ann_path, 'r') as csvfile:
1697
+ dataset = list(csv.DictReader(csvfile))
1698
+ elif ann_path.endswith('.json'):
1699
+ dataset = json.load(open(ann_path))
1700
+
1701
+ self.data_root = data_root
1702
+
1703
+ # It's used to balance num of images and videos.
1704
+ if video_repeat > 0:
1705
+ self.dataset = []
1706
+ for data in dataset:
1707
+ if data.get('type', 'image') != 'video':
1708
+ self.dataset.append(data)
1709
+
1710
+ for _ in range(video_repeat):
1711
+ for data in dataset:
1712
+ if data.get('type', 'image') == 'video':
1713
+ self.dataset.append(data)
1714
+ else:
1715
+ self.dataset = dataset
1716
+ del dataset
1717
+
1718
+ self.length = len(self.dataset)
1719
+ print(f"data scale: {self.length}")
1720
+ # TODO: enable bucket training
1721
+ self.enable_bucket = enable_bucket
1722
+ self.text_drop_ratio = text_drop_ratio
1723
+ self.enable_inpaint = enable_inpaint
1724
+ self.enable_camera_info = enable_camera_info
1725
+
1726
+ self.video_length_drop_start = video_length_drop_start
1727
+ self.video_length_drop_end = video_length_drop_end
1728
+
1729
+ # Video params
1730
+ self.video_sample_stride = video_sample_stride
1731
+ self.video_sample_n_frames = video_sample_n_frames
1732
+ self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
1733
+ self.video_transforms = transforms.Compose(
1734
+ [
1735
+ transforms.Resize(min(self.video_sample_size)),
1736
+ transforms.CenterCrop(self.video_sample_size),
1737
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
1738
+ ]
1739
+ )
1740
+ if self.enable_camera_info:
1741
+ self.video_transforms_camera = transforms.Compose(
1742
+ [
1743
+ transforms.Resize(min(self.video_sample_size)),
1744
+ transforms.CenterCrop(self.video_sample_size)
1745
+ ]
1746
+ )
1747
+
1748
+ # Image params
1749
+ self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size)
1750
+ self.image_transforms = transforms.Compose([
1751
+ transforms.Resize(min(self.image_sample_size)),
1752
+ transforms.CenterCrop(self.image_sample_size),
1753
+ transforms.ToTensor(),
1754
+ transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
1755
+ ])
1756
+
1757
+ self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size))
1758
+
1759
+ def get_batch(self, idx):
1760
+ data_info = self.dataset[idx % len(self.dataset)]
1761
+ video_id, text = data_info['file_path'], data_info['text']
1762
+
1763
+ if data_info.get('type', 'image')=='video':
1764
+ if self.data_root is None:
1765
+ video_dir = video_id
1766
+ else:
1767
+ video_dir = os.path.join(self.data_root, video_id)
1768
+
1769
+ with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
1770
+ min_sample_n_frames = min(
1771
+ self.video_sample_n_frames,
1772
+ int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
1773
+ )
1774
+ if min_sample_n_frames == 0:
1775
+ raise ValueError(f"No Frames in video.")
1776
+
1777
+ video_length = int(self.video_length_drop_end * len(video_reader))
1778
+ clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
1779
+ start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
1780
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
1781
+
1782
+ try:
1783
+ sample_args = (video_reader, batch_index)
1784
+ pixel_values = func_timeout(
1785
+ VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
1786
+ )
1787
+ resized_frames = []
1788
+ for i in range(len(pixel_values)):
1789
+ frame = pixel_values[i]
1790
+ resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
1791
+ resized_frames.append(resized_frame)
1792
+ pixel_values = np.array(resized_frames)
1793
+ except FunctionTimedOut:
1794
+ raise ValueError(f"Read {idx} timeout.")
1795
+ except Exception as e:
1796
+ raise ValueError(f"Failed to extract frames from video. Error is {e}.")
1797
+
1798
+ if not self.enable_bucket:
1799
+ pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
1800
+ pixel_values = pixel_values / 255.
1801
+ del video_reader
1802
+ else:
1803
+ pixel_values = pixel_values
1804
+
1805
+ if not self.enable_bucket:
1806
+ pixel_values = self.video_transforms(pixel_values)
1807
+
1808
+ # Random use no text generation
1809
+ if random.random() < self.text_drop_ratio:
1810
+ text = ''
1811
+
1812
+ control_video_id = data_info['control_file_path']
1813
+
1814
+ if self.data_root is None:
1815
+ control_video_id = control_video_id
1816
+ else:
1817
+ control_video_id = os.path.join(self.data_root, control_video_id)
1818
+
1819
+ if self.enable_camera_info:
1820
+ if control_video_id.lower().endswith('.txt'):
1821
+ if not self.enable_bucket:
1822
+ control_pixel_values = torch.zeros_like(pixel_values)
1823
+
1824
+ control_camera_values = process_pose_file(control_video_id, width=self.video_sample_size[1], height=self.video_sample_size[0])
1825
+ control_camera_values = torch.from_numpy(control_camera_values).permute(0, 3, 1, 2).contiguous()
1826
+ control_camera_values = F.interpolate(control_camera_values, size=(len(video_reader), control_camera_values.size(3)), mode='bilinear', align_corners=True)
1827
+ control_camera_values = self.video_transforms_camera(control_camera_values)
1828
+ else:
1829
+ control_pixel_values = np.zeros_like(pixel_values)
1830
+
1831
+ control_camera_values = process_pose_file(control_video_id, width=self.video_sample_size[1], height=self.video_sample_size[0], return_poses=True)
1832
+ control_camera_values = torch.from_numpy(np.array(control_camera_values)).unsqueeze(0).unsqueeze(0)
1833
+ control_camera_values = F.interpolate(control_camera_values, size=(len(video_reader), control_camera_values.size(3)), mode='bilinear', align_corners=True)[0][0]
1834
+ control_camera_values = np.array([control_camera_values[index] for index in batch_index])
1835
+ else:
1836
+ if not self.enable_bucket:
1837
+ control_pixel_values = torch.zeros_like(pixel_values)
1838
+ control_camera_values = None
1839
+ else:
1840
+ control_pixel_values = np.zeros_like(pixel_values)
1841
+ control_camera_values = None
1842
+ else:
1843
+ with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader:
1844
+ try:
1845
+ sample_args = (control_video_reader, batch_index)
1846
+ control_pixel_values = func_timeout(
1847
+ VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
1848
+ )
1849
+ resized_frames = []
1850
+ for i in range(len(control_pixel_values)):
1851
+ frame = control_pixel_values[i]
1852
+ resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
1853
+ resized_frames.append(resized_frame)
1854
+ control_pixel_values = np.array(resized_frames)
1855
+ except FunctionTimedOut:
1856
+ raise ValueError(f"Read {idx} timeout.")
1857
+ except Exception as e:
1858
+ raise ValueError(f"Failed to extract frames from video. Error is {e}.")
1859
+
1860
+ if not self.enable_bucket:
1861
+ control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous()
1862
+ control_pixel_values = control_pixel_values / 255.
1863
+ del control_video_reader
1864
+ else:
1865
+ control_pixel_values = control_pixel_values
1866
+
1867
+ if not self.enable_bucket:
1868
+ control_pixel_values = self.video_transforms(control_pixel_values)
1869
+ control_camera_values = None
1870
+
1871
+ return pixel_values, control_pixel_values, control_camera_values, text, "video"
1872
+ else:
1873
+ image_path, text = data_info['file_path'], data_info['text']
1874
+ if self.data_root is not None:
1875
+ image_path = os.path.join(self.data_root, image_path)
1876
+ image = Image.open(image_path).convert('RGB')
1877
+ if not self.enable_bucket:
1878
+ image = self.image_transforms(image).unsqueeze(0)
1879
+ else:
1880
+ image = np.expand_dims(np.array(image), 0)
1881
+
1882
+ if random.random() < self.text_drop_ratio:
1883
+ text = ''
1884
+
1885
+ control_image_id = data_info['control_file_path']
1886
+
1887
+ if self.data_root is None:
1888
+ control_image_id = control_image_id
1889
+ else:
1890
+ control_image_id = os.path.join(self.data_root, control_image_id)
1891
+
1892
+ control_image = Image.open(control_image_id).convert('RGB')
1893
+ if not self.enable_bucket:
1894
+ control_image = self.image_transforms(control_image).unsqueeze(0)
1895
+ else:
1896
+ control_image = np.expand_dims(np.array(control_image), 0)
1897
+ return image, control_image, None, text, 'image'
1898
+ def __len__(self):
1899
+ return self.length
1900
+
1901
+ def __getitem__(self, idx):
1902
+ data_info = self.dataset[idx % len(self.dataset)]
1903
+ data_type = data_info.get('type', 'image')
1904
+ while True:
1905
+ sample = {}
1906
+ try:
1907
+ data_info_local = self.dataset[idx % len(self.dataset)]
1908
+ data_type_local = data_info_local.get('type', 'image')
1909
+ if data_type_local != data_type:
1910
+ raise ValueError("data_type_local != data_type")
1911
+
1912
+ pixel_values, control_pixel_values, control_camera_values, name, data_type = self.get_batch(idx)
1913
+
1914
+ sample["pixel_values"] = pixel_values
1915
+ sample["control_pixel_values"] = control_pixel_values
1916
+ sample["text"] = name
1917
+ sample["data_type"] = data_type
1918
+ sample["idx"] = idx
1919
+
1920
+ if self.enable_camera_info:
1921
+ sample["control_camera_values"] = control_camera_values
1922
+
1923
+ if len(sample) > 0:
1924
+ break
1925
+ except Exception as e:
1926
+ print(e, self.dataset[idx % len(self.dataset)])
1927
+ idx = random.randint(0, self.length-1)
1928
+
1929
+ if self.enable_inpaint and not self.enable_bucket:
1930
+ mask = get_random_mask(pixel_values.size())
1931
+ mask_pixel_values = pixel_values * (1 - mask) + torch.zeros_like(pixel_values) * mask
1932
+ sample["mask_pixel_values"] = mask_pixel_values
1933
+ sample["mask"] = mask
1934
+
1935
+ clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
1936
+ clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
1937
+ sample["clip_pixel_values"] = clip_pixel_values
1938
+
1939
+ return sample
videox_fun/data/dataset_video.py ADDED
@@ -0,0 +1,262 @@
1
+ import csv
2
+ import gc
3
+ import io
4
+ import json
5
+ import math
6
+ import os
7
+ import random
8
+ from contextlib import contextmanager
9
+ from threading import Thread
10
+
11
+ import albumentations
12
+ import cv2
13
+ import numpy as np
14
+ import torch
15
+ import torchvision.transforms as transforms
16
+ from decord import VideoReader
17
+ from einops import rearrange
18
+ from func_timeout import FunctionTimedOut, func_timeout
19
+ from PIL import Image
20
+ from torch.utils.data import BatchSampler, Sampler
21
+ from torch.utils.data.dataset import Dataset
22
+
23
+ VIDEO_READER_TIMEOUT = 20
24
+
25
+ def get_random_mask(shape):
26
+ f, c, h, w = shape
27
+
28
+ mask_index = np.random.randint(0, 4)
29
+ mask = torch.zeros((f, 1, h, w), dtype=torch.uint8)
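+ # mask_index 0: mask every frame except the first; 1: mask all but the first and last frame;
+ # 2: mask a random static rectangle in all frames; 3: mask a random rectangle over a random temporal span.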
30
+ if mask_index == 0:
31
+ mask[1:, :, :, :] = 1
32
+ elif mask_index == 1:
33
+ mask_frame_index = 1
34
+ mask[mask_frame_index:-mask_frame_index, :, :, :] = 1
35
+ elif mask_index == 2:
36
+ center_x = torch.randint(0, w, (1,)).item()
37
+ center_y = torch.randint(0, h, (1,)).item()
38
+ block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # width range of the block
39
+ block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # height range of the block
40
+
41
+ start_x = max(center_x - block_size_x // 2, 0)
42
+ end_x = min(center_x + block_size_x // 2, w)
43
+ start_y = max(center_y - block_size_y // 2, 0)
44
+ end_y = min(center_y + block_size_y // 2, h)
45
+ mask[:, :, start_y:end_y, start_x:end_x] = 1
46
+ elif mask_index == 3:
47
+ center_x = torch.randint(0, w, (1,)).item()
48
+ center_y = torch.randint(0, h, (1,)).item()
49
+ block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # width range of the block
50
+ block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # height range of the block
51
+
52
+ start_x = max(center_x - block_size_x // 2, 0)
53
+ end_x = min(center_x + block_size_x // 2, w)
54
+ start_y = max(center_y - block_size_y // 2, 0)
55
+ end_y = min(center_y + block_size_y // 2, h)
56
+
57
+ mask_frame_before = np.random.randint(0, f // 2)
58
+ mask_frame_after = np.random.randint(f // 2, f)
59
+ mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1
60
+ else:
61
+ raise ValueError(f"The mask_index {mask_index} is not define")
62
+ return mask
63
+
64
+
65
+ @contextmanager
66
+ def VideoReader_contextmanager(*args, **kwargs):
67
+ vr = VideoReader(*args, **kwargs)
68
+ try:
69
+ yield vr
70
+ finally:
71
+ del vr
72
+ gc.collect()
73
+
74
+
75
+ def get_video_reader_batch(video_reader, batch_index):
76
+ frames = video_reader.get_batch(batch_index).asnumpy()
77
+ return frames
78
+
79
+
80
+ class WebVid10M(Dataset):
81
+ def __init__(
82
+ self,
83
+ csv_path, video_folder,
84
+ sample_size=256, sample_stride=4, sample_n_frames=16,
85
+ enable_bucket=False, enable_inpaint=False, is_image=False,
86
+ ):
87
+ print(f"loading annotations from {csv_path} ...")
88
+ with open(csv_path, 'r') as csvfile:
89
+ self.dataset = list(csv.DictReader(csvfile))
90
+ self.length = len(self.dataset)
91
+ print(f"data scale: {self.length}")
92
+
93
+ self.video_folder = video_folder
94
+ self.sample_stride = sample_stride
95
+ self.sample_n_frames = sample_n_frames
96
+ self.enable_bucket = enable_bucket
97
+ self.enable_inpaint = enable_inpaint
98
+ self.is_image = is_image
99
+
100
+ sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
101
+ self.pixel_transforms = transforms.Compose([
102
+ transforms.Resize(sample_size[0]),
103
+ transforms.CenterCrop(sample_size),
104
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
105
+ ])
106
+
107
+ def get_batch(self, idx):
108
+ video_dict = self.dataset[idx]
109
+ videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
110
+
111
+ video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
112
+ video_reader = VideoReader(video_dir)
113
+ video_length = len(video_reader)
114
+
115
+ if not self.is_image:
116
+ clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)
117
+ start_idx = random.randint(0, video_length - clip_length)
118
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
119
+ else:
120
+ batch_index = [random.randint(0, video_length - 1)]
121
+
122
+ if not self.enable_bucket:
123
+ pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
124
+ pixel_values = pixel_values / 255.
125
+ del video_reader
126
+ else:
127
+ pixel_values = video_reader.get_batch(batch_index).asnumpy()
128
+
129
+ if self.is_image:
130
+ pixel_values = pixel_values[0]
131
+ return pixel_values, name
132
+
133
+ def __len__(self):
134
+ return self.length
135
+
136
+ def __getitem__(self, idx):
137
+ while True:
138
+ try:
139
+ pixel_values, name = self.get_batch(idx)
140
+ break
141
+
142
+ except Exception as e:
143
+ print("Error info:", e)
144
+ idx = random.randint(0, self.length-1)
145
+
146
+ if not self.enable_bucket:
147
+ pixel_values = self.pixel_transforms(pixel_values)
148
+ if self.enable_inpaint:
149
+ mask = get_random_mask(pixel_values.size())
150
+ mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
151
+ sample = dict(pixel_values=pixel_values, mask_pixel_values=mask_pixel_values, mask=mask, text=name)
152
+ else:
153
+ sample = dict(pixel_values=pixel_values, text=name)
154
+ return sample
155
+
156
+
157
+ class VideoDataset(Dataset):
158
+ def __init__(
159
+ self,
160
+ json_path, video_folder=None,
161
+ sample_size=256, sample_stride=4, sample_n_frames=16,
162
+ enable_bucket=False, enable_inpaint=False
163
+ ):
164
+ print(f"loading annotations from {json_path} ...")
165
+ self.dataset = json.load(open(json_path, 'r'))
166
+ self.length = len(self.dataset)
167
+ print(f"data scale: {self.length}")
168
+
169
+ self.video_folder = video_folder
170
+ self.sample_stride = sample_stride
171
+ self.sample_n_frames = sample_n_frames
172
+ self.enable_bucket = enable_bucket
173
+ self.enable_inpaint = enable_inpaint
174
+
175
+ sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
176
+ self.pixel_transforms = transforms.Compose(
177
+ [
178
+ transforms.Resize(sample_size[0]),
179
+ transforms.CenterCrop(sample_size),
180
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
181
+ ]
182
+ )
183
+
184
+ def get_batch(self, idx):
185
+ video_dict = self.dataset[idx]
186
+ video_id, name = video_dict['file_path'], video_dict['text']
187
+
188
+ if self.video_folder is None:
189
+ video_dir = video_id
190
+ else:
191
+ video_dir = os.path.join(self.video_folder, video_id)
192
+
193
+ with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
194
+ video_length = len(video_reader)
195
+
196
+ clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)
197
+ start_idx = random.randint(0, video_length - clip_length)
198
+ batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
199
+
200
+ try:
201
+ sample_args = (video_reader, batch_index)
202
+ pixel_values = func_timeout(
203
+ VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
204
+ )
205
+ except FunctionTimedOut:
206
+ raise ValueError(f"Read {idx} timeout.")
207
+ except Exception as e:
208
+ raise ValueError(f"Failed to extract frames from video. Error is {e}.")
209
+
210
+ if not self.enable_bucket:
211
+ pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
212
+ pixel_values = pixel_values / 255.
213
+ del video_reader
214
+ else:
215
+ pixel_values = pixel_values
216
+
217
+ return pixel_values, name
218
+
219
+ def __len__(self):
220
+ return self.length
221
+
222
+ def __getitem__(self, idx):
223
+ while True:
224
+ try:
225
+ pixel_values, name = self.get_batch(idx)
226
+ break
227
+
228
+ except Exception as e:
229
+ print("Error info:", e)
230
+ idx = random.randint(0, self.length-1)
231
+
232
+ if not self.enable_bucket:
233
+ pixel_values = self.pixel_transforms(pixel_values)
234
+ if self.enable_inpaint:
235
+ mask = get_random_mask(pixel_values.size())
236
+ mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
237
+ sample = dict(pixel_values=pixel_values, mask_pixel_values=mask_pixel_values, mask=mask, text=name)
238
+ else:
239
+ sample = dict(pixel_values=pixel_values, text=name)
240
+ return sample
241
+
242
+
243
+ if __name__ == "__main__":
244
+ if 1:
245
+ dataset = VideoDataset(
246
+ json_path="/home/zhoumo.xjq/disk3/datasets/webvidval/results_2M_val.json",
247
+ sample_size=256,
248
+ sample_stride=4, sample_n_frames=16,
249
+ )
250
+
251
+ if 0:
252
+ dataset = WebVid10M(
253
+ csv_path="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv",
254
+ video_folder="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val",
255
+ sample_size=256,
256
+ sample_stride=4, sample_n_frames=16,
257
+ is_image=False,
258
+ )
259
+
260
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,)
261
+ for idx, batch in enumerate(dataloader):
262
+ print(batch["pixel_values"].shape, len(batch["text"]))
videox_fun/dist/__init__.py ADDED
@@ -0,0 +1,66 @@
1
+ import importlib.util
2
+
3
+ from .cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
4
+ from .fsdp import shard_model
5
+ from .fuser import (get_sequence_parallel_rank,
6
+ get_sequence_parallel_world_size, get_sp_group,
7
+ get_world_group, init_distributed_environment,
8
+ initialize_model_parallel, set_multi_gpus_devices,
9
+ xFuserLongContextAttention)
10
+ from .wan_xfuser import usp_attn_forward, usp_attn_s2v_forward
11
+ from .qwen_xfuser import QwenImageMultiGPUsAttnProcessor2_0
12
+ from .flux_xfuser import FluxMultiGPUsAttnProcessor2_0
13
+
14
+ # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
15
+ if importlib.util.find_spec("paifuser") is not None:
16
+ # --------------------------------------------------------------- #
17
+ # The simple_wrapper is used to solve the problem
18
+ # about conflicts between cython and torch.compile
19
+ # --------------------------------------------------------------- #
20
+ def simple_wrapper(func):
21
+ def inner(*args, **kwargs):
22
+ return func(*args, **kwargs)
23
+ return inner
24
+
25
+ # --------------------------------------------------------------- #
26
+ # Sparse Attention Kernel
27
+ # --------------------------------------------------------------- #
28
+ from paifuser.models import parallel_magvit_vae
29
+ from paifuser.ops import wan_usp_sparse_attention_wrapper
30
+ from . import wan_xfuser
31
+
32
+ # --------------------------------------------------------------- #
33
+ # Sparse Attention
34
+ # --------------------------------------------------------------- #
35
+ usp_sparse_attn_wrap_forward = simple_wrapper(wan_usp_sparse_attention_wrapper()(wan_xfuser.usp_attn_forward))
36
+ wan_xfuser.usp_attn_forward = usp_sparse_attn_wrap_forward
37
+ usp_attn_forward = usp_sparse_attn_wrap_forward
38
+ print("Import PAI VAE Turbo and Sparse Attention")
39
+
40
+ # --------------------------------------------------------------- #
41
+ # Fast Rope Kernel
42
+ # --------------------------------------------------------------- #
43
+ import types
44
+ import torch
45
+ from paifuser.ops import (ENABLE_KERNEL, usp_fast_rope_apply_qk,
46
+ usp_rope_apply_real_qk)
47
+
48
+ def deepcopy_function(f):
49
+ return types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__,closure=f.__closure__)
50
+
51
+ local_rope_apply_qk = deepcopy_function(wan_xfuser.rope_apply_qk)
52
+
53
+ if ENABLE_KERNEL:
54
+ def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
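+ # The fused rope kernel is inference-only, so fall back to the original implementation when gradients are enabled.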
55
+ if torch.is_grad_enabled():
56
+ return local_rope_apply_qk(q, k, grid_sizes, freqs)
57
+ else:
58
+ return usp_fast_rope_apply_qk(q, k, grid_sizes, freqs)
59
+
60
+ else:
61
+ def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
62
+ return usp_rope_apply_real_qk(q, k, grid_sizes, freqs)
63
+
64
+ wan_xfuser.rope_apply_qk = adaptive_fast_usp_rope_apply_qk
65
+ rope_apply_qk = adaptive_fast_usp_rope_apply_qk
66
+ print("Import PAI Fast rope")
videox_fun/dist/cogvideox_xfuser.py ADDED
@@ -0,0 +1,105 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from diffusers.models.attention import Attention
6
+ from diffusers.models.embeddings import apply_rotary_emb
7
+
8
+ from .fuser import (get_sequence_parallel_rank,
9
+ get_sequence_parallel_world_size, get_sp_group,
10
+ init_distributed_environment, initialize_model_parallel,
11
+ xFuserLongContextAttention)
12
+
13
+ class CogVideoXMultiGPUsAttnProcessor2_0:
14
+ r"""
15
+ Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
16
+ query and key vectors, but does not include spatial normalization.
17
+ """
18
+
19
+ def __init__(self):
20
+ if xFuserLongContextAttention is not None:
21
+ try:
22
+ self.hybrid_seq_parallel_attn = xFuserLongContextAttention()
23
+ except Exception:
24
+ self.hybrid_seq_parallel_attn = None
25
+ else:
26
+ self.hybrid_seq_parallel_attn = None
27
+ if not hasattr(F, "scaled_dot_product_attention"):
28
+ raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
29
+
30
+ def __call__(
31
+ self,
32
+ attn: Attention,
33
+ hidden_states: torch.Tensor,
34
+ encoder_hidden_states: torch.Tensor,
35
+ attention_mask: Optional[torch.Tensor] = None,
36
+ image_rotary_emb: Optional[torch.Tensor] = None,
37
+ ) -> torch.Tensor:
38
+ text_seq_length = encoder_hidden_states.size(1)
39
+
40
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
41
+
42
+ batch_size, sequence_length, _ = (
43
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
44
+ )
45
+
46
+ if attention_mask is not None:
47
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
48
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
49
+
50
+ query = attn.to_q(hidden_states)
51
+ key = attn.to_k(hidden_states)
52
+ value = attn.to_v(hidden_states)
53
+
54
+ inner_dim = key.shape[-1]
55
+ head_dim = inner_dim // attn.heads
56
+
57
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
58
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
59
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
60
+
61
+ if attn.norm_q is not None:
62
+ query = attn.norm_q(query)
63
+ if attn.norm_k is not None:
64
+ key = attn.norm_k(key)
65
+
66
+ # Apply RoPE if needed
67
+ if image_rotary_emb is not None:
68
+ query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
69
+ if not attn.is_cross_attention:
70
+ key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
71
+
72
+ if self.hybrid_seq_parallel_attn is None:
73
+ hidden_states = F.scaled_dot_product_attention(
74
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
75
+ )
76
+ hidden_states = hidden_states
77
+ else:
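+ # Sequence-parallel path: image tokens are sharded across ranks, while the short text tokens
+ # are passed as joint tensors ('front' strategy) to every rank.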
78
+ img_q = query[:, :, text_seq_length:].transpose(1, 2)
79
+ txt_q = query[:, :, :text_seq_length].transpose(1, 2)
80
+ img_k = key[:, :, text_seq_length:].transpose(1, 2)
81
+ txt_k = key[:, :, :text_seq_length].transpose(1, 2)
82
+ img_v = value[:, :, text_seq_length:].transpose(1, 2)
83
+ txt_v = value[:, :, :text_seq_length].transpose(1, 2)
84
+
85
+ hidden_states = self.hybrid_seq_parallel_attn(
86
+ None,
87
+ img_q, img_k, img_v, dropout_p=0.0, causal=False,
88
+ joint_tensor_query=txt_q,
89
+ joint_tensor_key=txt_k,
90
+ joint_tensor_value=txt_v,
91
+ joint_strategy='front',
92
+ ).transpose(1, 2)
93
+
94
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
95
+
96
+ # linear proj
97
+ hidden_states = attn.to_out[0](hidden_states)
98
+ # dropout
99
+ hidden_states = attn.to_out[1](hidden_states)
100
+
101
+ encoder_hidden_states, hidden_states = hidden_states.split(
102
+ [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
103
+ )
104
+ return hidden_states, encoder_hidden_states
105
+
videox_fun/dist/flux_xfuser.py ADDED
@@ -0,0 +1,168 @@
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from diffusers.models.attention_processor import Attention
6
+
7
+ from .fuser import xFuserLongContextAttention
8
+
9
+
10
+ def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
11
+ query = attn.to_q(hidden_states)
12
+ key = attn.to_k(hidden_states)
13
+ value = attn.to_v(hidden_states)
14
+
15
+ encoder_query = encoder_key = encoder_value = None
16
+ if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
17
+ encoder_query = attn.add_q_proj(encoder_hidden_states)
18
+ encoder_key = attn.add_k_proj(encoder_hidden_states)
19
+ encoder_value = attn.add_v_proj(encoder_hidden_states)
20
+
21
+ return query, key, value, encoder_query, encoder_key, encoder_value
22
+
23
+
24
+ def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
25
+ return _get_projections(attn, hidden_states, encoder_hidden_states)
26
+
27
+
28
+ def apply_rotary_emb(
29
+ x: torch.Tensor,
30
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
31
+ use_real: bool = True,
32
+ use_real_unbind_dim: int = -1,
33
+ sequence_dim: int = 2,
34
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
35
+ """
36
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
37
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
38
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
39
+ tensors contain rotary embeddings and are returned as real tensors.
40
+
41
+ Args:
42
+ x (`torch.Tensor`):
43
+ Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
44
+ freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
45
+
46
+ Returns:
47
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
48
+ """
49
+ if use_real:
50
+ cos, sin = freqs_cis # [S, D]
51
+ if sequence_dim == 2:
52
+ cos = cos[None, None, :, :]
53
+ sin = sin[None, None, :, :]
54
+ elif sequence_dim == 1:
55
+ cos = cos[None, :, None, :]
56
+ sin = sin[None, :, None, :]
57
+ else:
58
+ raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
59
+
60
+ cos, sin = cos.to(x.device), sin.to(x.device)
61
+
62
+ if use_real_unbind_dim == -1:
63
+ # Used for flux, cogvideox, hunyuan-dit
64
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, H, S, D//2]
65
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
66
+ elif use_real_unbind_dim == -2:
67
+ # Used for Stable Audio, OmniGen, CogView4 and Cosmos
68
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, H, S, D//2]
69
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
70
+ else:
71
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
72
+
73
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
74
+
75
+ return out
76
+ else:
77
+ # used for lumina
78
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
79
+ freqs_cis = freqs_cis.unsqueeze(2)
80
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
81
+
82
+ return x_out.type_as(x)
83
+
84
+
85
+ class FluxMultiGPUsAttnProcessor2_0:
86
+ r"""
87
+ Processor for implementing multi-GPU (sequence-parallel) scaled dot-product attention for the Flux model. It applies
88
+ a rotary embedding on query and key vectors, but does not include spatial normalization.
89
+ """
90
+
91
+ def __init__(self):
92
+ if not hasattr(F, "scaled_dot_product_attention"):
93
+ raise ImportError("FluxMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
94
+
95
+ def __call__(
96
+ self,
97
+ attn: "FluxAttention",
98
+ hidden_states: torch.Tensor,
99
+ encoder_hidden_states: torch.Tensor = None,
100
+ attention_mask: Optional[torch.Tensor] = None,
101
+ image_rotary_emb: Optional[torch.Tensor] = None,
102
+ text_seq_len: int = None,
103
+ ) -> torch.FloatTensor:
104
+ query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
105
+ attn, hidden_states, encoder_hidden_states
106
+ )
107
+
108
+ query = query.unflatten(-1, (attn.heads, -1))
109
+ key = key.unflatten(-1, (attn.heads, -1))
110
+ value = value.unflatten(-1, (attn.heads, -1))
111
+
112
+ query = attn.norm_q(query)
113
+ key = attn.norm_k(key)
114
+
115
+ if attn.added_kv_proj_dim is not None:
116
+ encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
117
+ encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
118
+ encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
119
+
120
+ encoder_query = attn.norm_added_q(encoder_query)
121
+ encoder_key = attn.norm_added_k(encoder_key)
122
+
123
+ query = torch.cat([encoder_query, query], dim=1)
124
+ key = torch.cat([encoder_key, key], dim=1)
125
+ value = torch.cat([encoder_value, value], dim=1)
126
+
127
+ if image_rotary_emb is not None:
128
+ query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
129
+ key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
130
+
131
+ text_seq_len = encoder_query.shape[1]
132
+ txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
133
+ img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
134
+ else:
135
+ if image_rotary_emb is not None:
136
+ query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
137
+ key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
138
+ txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
139
+ img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
140
+
141
+ half_dtypes = (torch.float16, torch.bfloat16)
142
+ def half(x):
143
+ return x if x.dtype in half_dtypes else x.to(torch.bfloat16) # assume bfloat16 as the half-precision target; `dtype` was undefined here
144
+
145
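+ # Text tokens are attached as joint tensors ('front') so every rank attends over the full text plus its shard of image tokens.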
+ hidden_states = xFuserLongContextAttention()(
146
+ None,
147
+ half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
148
+ joint_tensor_query=half(txt_query) if txt_query is not None else None,
149
+ joint_tensor_key=half(txt_key) if txt_key is not None else None,
150
+ joint_tensor_value=half(txt_value) if txt_value is not None else None,
151
+ joint_strategy='front',
152
+ )
153
+
154
+ # Reshape back
155
+ hidden_states = hidden_states.flatten(2, 3)
156
+ hidden_states = hidden_states.to(img_query.dtype)
157
+
158
+ if encoder_hidden_states is not None:
159
+ encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
160
+ [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
161
+ )
162
+ hidden_states = attn.to_out[0](hidden_states)
163
+ hidden_states = attn.to_out[1](hidden_states)
164
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
165
+
166
+ return hidden_states, encoder_hidden_states
167
+ else:
168
+ return hidden_states
videox_fun/dist/fsdp.py ADDED
@@ -0,0 +1,44 @@
1
+ # Copied from https://github.com/Wan-Video/Wan2.1/blob/main/wan/distributed/fsdp.py
2
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
+ import gc
4
+ from functools import partial
5
+
6
+ import torch
7
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
8
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
9
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
10
+ from torch.distributed.utils import _free_storage
11
+
12
+
13
+ def shard_model(
14
+ model,
15
+ device_id,
16
+ param_dtype=torch.bfloat16,
17
+ reduce_dtype=torch.float32,
18
+ buffer_dtype=torch.float32,
19
+ process_group=None,
20
+ sharding_strategy=ShardingStrategy.FULL_SHARD,
21
+ sync_module_states=True,
22
+ module_to_wrapper=None,
23
+ ):
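+ # Each transformer block in model.blocks becomes its own FSDP unit unless an explicit module set is given via module_to_wrapper.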
24
+ model = FSDP(
25
+ module=model,
26
+ process_group=process_group,
27
+ sharding_strategy=sharding_strategy,
28
+ auto_wrap_policy=partial(
29
+ lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks if module_to_wrapper is None else module_to_wrapper),
30
+ mixed_precision=MixedPrecision(
31
+ param_dtype=param_dtype,
32
+ reduce_dtype=reduce_dtype,
33
+ buffer_dtype=buffer_dtype),
34
+ device_id=device_id,
35
+ sync_module_states=sync_module_states)
36
+ return model
37
+
38
+ def free_model(model):
39
+ for m in model.modules():
40
+ if isinstance(m, FSDP):
41
+ _free_storage(m._handle.flat_param.data)
42
+ del model
43
+ gc.collect()
44
+ torch.cuda.empty_cache()
videox_fun/dist/fuser.py ADDED
@@ -0,0 +1,55 @@
1
+ import importlib.util
2
+
3
+ import torch
4
+ import torch.distributed as dist
5
+
6
+ try:
7
+ # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
8
+ if importlib.util.find_spec("paifuser") is not None:
9
+ import paifuser
10
+ from paifuser.xfuser.core.distributed import (
11
+ get_sequence_parallel_rank, get_sequence_parallel_world_size,
12
+ get_sp_group, get_world_group, init_distributed_environment,
13
+ initialize_model_parallel)
14
+ from paifuser.xfuser.core.long_ctx_attention import \
15
+ xFuserLongContextAttention
16
+ print("Import PAI DiT Turbo")
17
+ else:
18
+ import xfuser
19
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
20
+ get_sequence_parallel_world_size,
21
+ get_sp_group, get_world_group,
22
+ init_distributed_environment,
23
+ initialize_model_parallel)
24
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
25
+ print("Xfuser import sucessful")
26
+ except Exception as ex:
27
+ get_sequence_parallel_world_size = None
28
+ get_sequence_parallel_rank = None
29
+ xFuserLongContextAttention = None
30
+ get_sp_group = None
31
+ get_world_group = None
32
+ init_distributed_environment = None
33
+ initialize_model_parallel = None
34
+
35
+ def set_multi_gpus_devices(ulysses_degree, ring_degree, classifier_free_guidance_degree=1):
36
+ if ulysses_degree > 1 or ring_degree > 1 or classifier_free_guidance_degree > 1:
37
+ if get_sp_group is None:
38
+ raise RuntimeError("xfuser is not installed.")
39
+ dist.init_process_group("nccl")
40
+ print('parallel inference enabled: ulysses_degree=%d ring_degree=%d classifier_free_guidance_degree=%d rank=%d world_size=%d' % (
41
+ ulysses_degree, ring_degree, classifier_free_guidance_degree, dist.get_rank(),
42
+ dist.get_world_size()))
43
+ assert dist.get_world_size() == ring_degree * ulysses_degree * classifier_free_guidance_degree, \
44
+ "number of GPUs(%d) should be equal to ring_degree * ulysses_degree * classifier_free_guidance_degree." % dist.get_world_size()
45
+ init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
46
+ initialize_model_parallel(sequence_parallel_degree=ring_degree * ulysses_degree,
47
+ classifier_free_guidance_degree=classifier_free_guidance_degree,
48
+ ring_degree=ring_degree,
49
+ ulysses_degree=ulysses_degree)
50
+ # device = torch.device("cuda:%d" % dist.get_rank())
51
+ device = torch.device(f"cuda:{get_world_group().local_rank}")
52
+ print('rank=%d device=%s' % (get_world_group().rank, str(device)))
53
+ else:
54
+ device = "cuda"
55
+ return device
videox_fun/dist/qwen_xfuser.py ADDED
@@ -0,0 +1,176 @@
1
+ import functools
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import types
7
+ import warnings
8
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.cuda.amp as amp
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
16
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
17
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
18
+ from diffusers.models.attention import FeedForward
19
+ from diffusers.models.attention_processor import Attention
20
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
21
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
22
+ from diffusers.models.modeling_utils import ModelMixin
23
+ from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
24
+ from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
25
+ scale_lora_layers, unscale_lora_layers)
26
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
27
+ from torch import nn
28
+ from .fuser import (get_sequence_parallel_rank,
29
+ get_sequence_parallel_world_size, get_sp_group,
30
+ init_distributed_environment, initialize_model_parallel,
31
+ xFuserLongContextAttention)
32
+
33
+ def apply_rotary_emb_qwen(
34
+ x: torch.Tensor,
35
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
36
+ use_real: bool = True,
37
+ use_real_unbind_dim: int = -1,
38
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
39
+ """
40
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
41
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
42
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
43
+ tensors contain rotary embeddings and are returned as real tensors.
44
+
45
+ Args:
46
+ x (`torch.Tensor`):
47
+ Query or key tensor to apply rotary embeddings. [B, S, H, D] xk (torch.Tensor): Key tensor to apply
48
+ freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
49
+
50
+ Returns:
51
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
52
+ """
53
+ if use_real:
54
+ cos, sin = freqs_cis # [S, D]
55
+ cos = cos[None, None]
56
+ sin = sin[None, None]
57
+ cos, sin = cos.to(x.device), sin.to(x.device)
58
+
59
+ if use_real_unbind_dim == -1:
60
+ # Used for flux, cogvideox, hunyuan-dit
61
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
62
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
63
+ elif use_real_unbind_dim == -2:
64
+ # Used for Stable Audio, OmniGen, CogView4 and Cosmos
65
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
66
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
67
+ else:
68
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
69
+
70
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
71
+
72
+ return out
73
+ else:
74
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
75
+ freqs_cis = freqs_cis.unsqueeze(1)
76
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
77
+
78
+ return x_out.type_as(x)
79
+
80
+
81
+ class QwenImageMultiGPUsAttnProcessor2_0:
82
+ r"""
83
+ Processor for implementing scaled dot-product attention for the QwenImage double-stream blocks. It applies a rotary embedding on
84
+ query and key vectors, but does not include spatial normalization.
85
+ """
86
+
87
+ def __init__(self):
88
+ if not hasattr(F, "scaled_dot_product_attention"):
89
+ raise ImportError("QwenImageMultiGPUsAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
90
+
91
+ def __call__(
92
+ self,
93
+ attn: Attention,
94
+ hidden_states: torch.FloatTensor, # Image stream
95
+ encoder_hidden_states: torch.FloatTensor = None, # Text stream
96
+ encoder_hidden_states_mask: torch.FloatTensor = None,
97
+ attention_mask: Optional[torch.FloatTensor] = None,
98
+ image_rotary_emb: Optional[torch.Tensor] = None,
99
+ ) -> torch.FloatTensor:
100
+ if encoder_hidden_states is None:
101
+ raise ValueError("QwenImageMultiGPUsAttnProcessor2_0 requires encoder_hidden_states (text stream)")
102
+
103
+ seq_txt = encoder_hidden_states.shape[1]
104
+
105
+ # Compute QKV for image stream (sample projections)
106
+ img_query = attn.to_q(hidden_states)
107
+ img_key = attn.to_k(hidden_states)
108
+ img_value = attn.to_v(hidden_states)
109
+
110
+ # Compute QKV for text stream (context projections)
111
+ txt_query = attn.add_q_proj(encoder_hidden_states)
112
+ txt_key = attn.add_k_proj(encoder_hidden_states)
113
+ txt_value = attn.add_v_proj(encoder_hidden_states)
114
+
115
+ # Reshape for multi-head attention
116
+ img_query = img_query.unflatten(-1, (attn.heads, -1))
117
+ img_key = img_key.unflatten(-1, (attn.heads, -1))
118
+ img_value = img_value.unflatten(-1, (attn.heads, -1))
119
+
120
+ txt_query = txt_query.unflatten(-1, (attn.heads, -1))
121
+ txt_key = txt_key.unflatten(-1, (attn.heads, -1))
122
+ txt_value = txt_value.unflatten(-1, (attn.heads, -1))
123
+
124
+ # Apply QK normalization
125
+ if attn.norm_q is not None:
126
+ img_query = attn.norm_q(img_query)
127
+ if attn.norm_k is not None:
128
+ img_key = attn.norm_k(img_key)
129
+ if attn.norm_added_q is not None:
130
+ txt_query = attn.norm_added_q(txt_query)
131
+ if attn.norm_added_k is not None:
132
+ txt_key = attn.norm_added_k(txt_key)
133
+
134
+ # Apply RoPE
135
+ if image_rotary_emb is not None:
136
+ img_freqs, txt_freqs = image_rotary_emb
137
+ img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
138
+ img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
139
+ txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
140
+ txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)
141
+
142
+ # Concatenate for joint attention
143
+ # Order: [text, image]
144
+ # joint_query = torch.cat([txt_query, img_query], dim=1)
145
+ # joint_key = torch.cat([txt_key, img_key], dim=1)
146
+ # joint_value = torch.cat([txt_value, img_value], dim=1)
147
+
148
+ half_dtypes = (torch.float16, torch.bfloat16)
149
+ def half(x):
150
+ return x if x.dtype in half_dtypes else x.to(torch.bfloat16)  # NOTE: no dtype variable exists in this scope; fall back to bfloat16
151
+
152
+ joint_hidden_states = xFuserLongContextAttention()(
153
+ None,
154
+ half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
155
+ joint_tensor_query=half(txt_query),
156
+ joint_tensor_key=half(txt_key),
157
+ joint_tensor_value=half(txt_value),
158
+ joint_strategy='front',
159
+ )
160
+
161
+ # Reshape back
162
+ joint_hidden_states = joint_hidden_states.flatten(2, 3)
163
+ joint_hidden_states = joint_hidden_states.to(img_query.dtype)
164
+
165
+ # Split attention outputs back
166
+ txt_attn_output = joint_hidden_states[:, :seq_txt, :] # Text part
167
+ img_attn_output = joint_hidden_states[:, seq_txt:, :] # Image part
168
+
169
+ # Apply output projections
170
+ img_attn_output = attn.to_out[0](img_attn_output)
171
+ if len(attn.to_out) > 1:
172
+ img_attn_output = attn.to_out[1](img_attn_output) # dropout
173
+
174
+ txt_attn_output = attn.to_add_out(txt_attn_output)
175
+
176
+ return img_attn_output, txt_attn_output
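Note (not part of this commit): `xFuserLongContextAttention` with `joint_strategy='front'` distributes a joint text+image attention across sequence-parallel ranks, with the text tokens prepended to the image tokens. A minimal single-GPU sketch of the equivalent computation, under the assumption that all inputs are shaped `[B, S, H, D]` and that the function name below is illustrative:

```python
import torch
import torch.nn.functional as F

def joint_attention_reference(img_q, img_k, img_v, txt_q, txt_k, txt_v):
    # prepend the text stream to the image stream along the sequence dimension
    q = torch.cat([txt_q, img_q], dim=1)  # [B, S_txt + S_img, H, D]
    k = torch.cat([txt_k, img_k], dim=1)
    v = torch.cat([txt_v, img_v], dim=1)
    # SDPA expects [B, H, S, D]; transpose in and out
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
    ).transpose(1, 2)                      # back to [B, S, H, D]
    return out

if __name__ == "__main__":
    B, H, D = 1, 4, 16
    img = [torch.randn(B, 32, H, D) for _ in range(3)]
    txt = [torch.randn(B, 8, H, D) for _ in range(3)]
    print(joint_attention_reference(*img, *txt).shape)  # torch.Size([1, 40, 4, 16])
```

The processor above then splits the joint output back into a text part (`[:, :seq_txt]`) and an image part (`[:, seq_txt:]`) before the output projections.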
videox_fun/dist/wan_xfuser.py ADDED
@@ -0,0 +1,180 @@
1
+ import torch
2
+ import torch.cuda.amp as amp
3
+
4
+ from .fuser import (get_sequence_parallel_rank,
5
+ get_sequence_parallel_world_size, get_sp_group,
6
+ init_distributed_environment, initialize_model_parallel,
7
+ xFuserLongContextAttention)
8
+
9
+
10
+ def pad_freqs(original_tensor, target_len):
11
+ seq_len, s1, s2 = original_tensor.shape
12
+ pad_size = target_len - seq_len
13
+ padding_tensor = torch.ones(
14
+ pad_size,
15
+ s1,
16
+ s2,
17
+ dtype=original_tensor.dtype,
18
+ device=original_tensor.device)
19
+ padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
20
+ return padded_tensor
21
+
22
+ @amp.autocast(enabled=False)
23
+ @torch.compiler.disable()
24
+ def rope_apply(x, grid_sizes, freqs):
25
+ """
26
+ x: [B, L, N, C].
27
+ grid_sizes: [B, 3].
28
+ freqs: [M, C // 2].
29
+ """
30
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
31
+ # split freqs
32
+ freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
33
+
34
+ # loop over samples
35
+ output = []
36
+ for i, (f, h, w) in enumerate(grid_sizes.tolist()):
37
+ seq_len = f * h * w
38
+
39
+ # precompute multipliers
40
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float32).reshape(
41
+ s, n, -1, 2))
42
+ freqs_i = torch.cat([
43
+ freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
44
+ freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
45
+ freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
46
+ ],
47
+ dim=-1).reshape(seq_len, 1, -1)
48
+
49
+ # apply rotary embedding
50
+ sp_size = get_sequence_parallel_world_size()
51
+ sp_rank = get_sequence_parallel_rank()
52
+ freqs_i = pad_freqs(freqs_i, s * sp_size)
53
+ s_per_rank = s
54
+ freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
55
+ s_per_rank), :, :]
56
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
57
+ x_i = torch.cat([x_i, x[i, s:]])
58
+
59
+ # append to collection
60
+ output.append(x_i)
61
+ return torch.stack(output)
62
+
63
+ def rope_apply_qk(q, k, grid_sizes, freqs):
64
+ q = rope_apply(q, grid_sizes, freqs)
65
+ k = rope_apply(k, grid_sizes, freqs)
66
+ return q, k
67
+
68
+ def usp_attn_forward(self,
69
+ x,
70
+ seq_lens,
71
+ grid_sizes,
72
+ freqs,
73
+ dtype=torch.bfloat16,
74
+ t=0):
75
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
76
+ half_dtypes = (torch.float16, torch.bfloat16)
77
+
78
+ def half(x):
79
+ return x if x.dtype in half_dtypes else x.to(dtype)
80
+
81
+ # query, key, value function
82
+ def qkv_fn(x):
83
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
84
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
85
+ v = self.v(x).view(b, s, n, d)
86
+ return q, k, v
87
+
88
+ q, k, v = qkv_fn(x)
89
+ q, k = rope_apply_qk(q, k, grid_sizes, freqs)
90
+
91
+ # TODO: We should use unpadded q, k, v for attention.
92
+ # k_lens = seq_lens // get_sequence_parallel_world_size()
93
+ # if k_lens is not None:
94
+ # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
95
+ # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
96
+ # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
97
+
98
+ x = xFuserLongContextAttention()(
99
+ None,
100
+ query=half(q),
101
+ key=half(k),
102
+ value=half(v),
103
+ window_size=self.window_size)
104
+
105
+ # TODO: padding after attention.
106
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
107
+
108
+ # output
109
+ x = x.flatten(2)
110
+ x = self.o(x)
111
+ return x
112
+
113
+ @amp.autocast(enabled=False)
114
+ @torch.compiler.disable()
115
+ def s2v_rope_apply(x, grid_sizes, freqs):
116
+ s, n, c = x.size(1), x.size(2), x.size(3) // 2
117
+ # loop over samples
118
+ output = []
119
+ for i, _ in enumerate(x):
120
+ s = x.size(1)
121
+ # precompute multipliers
122
+ x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
123
+ s, n, -1, 2))
124
+ freqs_i = freqs[i]
125
+ freqs_i_rank = pad_freqs(freqs_i, s)
126
+ x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
127
+ x_i = torch.cat([x_i, x[i, s:]])
128
+ # append to collection
129
+ output.append(x_i)
130
+ return torch.stack(output).float()
131
+
132
+ def s2v_rope_apply_qk(q, k, grid_sizes, freqs):
133
+ q = s2v_rope_apply(q, grid_sizes, freqs)
134
+ k = s2v_rope_apply(k, grid_sizes, freqs)
135
+ return q, k
136
+
137
+ def usp_attn_s2v_forward(self,
138
+ x,
139
+ seq_lens,
140
+ grid_sizes,
141
+ freqs,
142
+ dtype=torch.bfloat16,
143
+ t=0):
144
+ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
145
+ half_dtypes = (torch.float16, torch.bfloat16)
146
+
147
+ def half(x):
148
+ return x if x.dtype in half_dtypes else x.to(dtype)
149
+
150
+ # query, key, value function
151
+ def qkv_fn(x):
152
+ q = self.norm_q(self.q(x)).view(b, s, n, d)
153
+ k = self.norm_k(self.k(x)).view(b, s, n, d)
154
+ v = self.v(x).view(b, s, n, d)
155
+ return q, k, v
156
+
157
+ q, k, v = qkv_fn(x)
158
+ q, k = s2v_rope_apply_qk(q, k, grid_sizes, freqs)
159
+
160
+ # TODO: We should use unpadded q, k, v for attention.
161
+ # k_lens = seq_lens // get_sequence_parallel_world_size()
162
+ # if k_lens is not None:
163
+ # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
164
+ # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
165
+ # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
166
+
167
+ x = xFuserLongContextAttention()(
168
+ None,
169
+ query=half(q),
170
+ key=half(k),
171
+ value=half(v),
172
+ window_size=self.window_size)
173
+
174
+ # TODO: padding after attention.
175
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
176
+
177
+ # output
178
+ x = x.flatten(2)
179
+ x = self.o(x)
180
+ return x
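Note (not part of this commit): in `rope_apply` above, each rank only holds its local shard of `s` tokens, so the full-sequence frequency table is padded to `s * sp_size` and every rank slices out its own window before rotating. A small standalone illustration of that padding/slicing step (all sizes below are made up):

```python
import torch

def pad_freqs(original_tensor, target_len):
    seq_len, s1, s2 = original_tensor.shape
    padding = torch.ones(target_len - seq_len, s1, s2,
                         dtype=original_tensor.dtype, device=original_tensor.device)
    return torch.cat([original_tensor, padding], dim=0)

sp_size, s = 4, 10                       # 4 sequence-parallel ranks, 10 tokens per rank
freqs = torch.randn(37, 1, 8)            # full-sequence freqs (37 < 40, so padding is needed)
freqs = pad_freqs(freqs, s * sp_size)    # pad to s * sp_size with ones (identity rotation)
for sp_rank in range(sp_size):
    chunk = freqs[sp_rank * s:(sp_rank + 1) * s]  # each rank rotates only its own tokens
    print(sp_rank, chunk.shape)          # (10, 1, 8) on every rank
```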
videox_fun/pipeline/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ from .pipeline_wan import WanPipeline
2
+ from .pipeline_wan2_2 import Wan2_2Pipeline
3
+
4
+ WanFunPipeline = WanPipeline
5
+ Wan2_2FunPipeline = Wan2_2Pipeline
6
+
7
+ import importlib.util
8
+
9
+ if importlib.util.find_spec("paifuser") is not None:
10
+ # --------------------------------------------------------------- #
11
+ # Sparse Attention
12
+ # --------------------------------------------------------------- #
13
+ from paifuser.ops import sparse_reset
14
+
15
+ # Wan2.1
16
+ WanFunPipeline.__call__ = sparse_reset(WanFunPipeline.__call__)
17
+ WanPipeline.__call__ = sparse_reset(WanPipeline.__call__)
18
+
19
+ # Wan2.2
20
+ Wan2_2FunPipeline.__call__ = sparse_reset(Wan2_2FunPipeline.__call__)
21
+ Wan2_2Pipeline.__call__ = sparse_reset(Wan2_2Pipeline.__call__)
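Note (not part of this commit): the `__init__.py` above only patches the pipelines when the optional `paifuser` accelerator is installed. A generic, runnable sketch of this optional-dependency pattern; `DummyPipeline` and `sparse_reset_like` are illustrative stand-ins, not real APIs:

```python
import functools
import importlib.util

class DummyPipeline:
    def __call__(self, x):
        return x * 2

def sparse_reset_like(fn):
    """Stand-in for an accelerator decorator: wraps __call__ without changing its result."""
    @functools.wraps(fn)
    def wrapped(self, *args, **kwargs):
        # a real wrapper would reset sparse-attention state here
        return fn(self, *args, **kwargs)
    return wrapped

if importlib.util.find_spec("paifuser") is not None:   # only patch when the package is importable
    DummyPipeline.__call__ = sparse_reset_like(DummyPipeline.__call__)

print(DummyPipeline()(3))  # 6, whether or not paifuser is installed
```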
videox_fun/pipeline/pipeline_wan.py ADDED
@@ -0,0 +1,799 @@
1
+ import inspect
2
+ import math
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from diffusers import FlowMatchEulerDiscreteScheduler
9
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
10
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
11
+ from diffusers.utils import BaseOutput, logging, replace_example_docstring
12
+ from diffusers.utils.torch_utils import randn_tensor
13
+ from diffusers.video_processor import VideoProcessor
14
+
15
+ from ..models import (AutoencoderKLWan, AutoTokenizer,
16
+ WanT5EncoderModel, WanTransformer3DModel)
17
+ from ..utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
18
+ get_sampling_sigmas)
19
+ from ..utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
20
+
21
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
+
23
+
24
+ EXAMPLE_DOC_STRING = """
25
+ Examples:
26
+ ```python
27
+ pass
28
+ ```
29
+ """
30
+
31
+
32
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
33
+ def retrieve_timesteps(
34
+ scheduler,
35
+ num_inference_steps: Optional[int] = None,
36
+ device: Optional[Union[str, torch.device]] = None,
37
+ timesteps: Optional[List[int]] = None,
38
+ sigmas: Optional[List[float]] = None,
39
+ **kwargs,
40
+ ):
41
+ """
42
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
43
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
44
+
45
+ Args:
46
+ scheduler (`SchedulerMixin`):
47
+ The scheduler to get timesteps from.
48
+ num_inference_steps (`int`):
49
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
50
+ must be `None`.
51
+ device (`str` or `torch.device`, *optional*):
52
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
53
+ timesteps (`List[int]`, *optional*):
54
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
55
+ `num_inference_steps` and `sigmas` must be `None`.
56
+ sigmas (`List[float]`, *optional*):
57
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
58
+ `num_inference_steps` and `timesteps` must be `None`.
59
+
60
+ Returns:
61
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
62
+ second element is the number of inference steps.
63
+ """
64
+ if timesteps is not None and sigmas is not None:
65
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
66
+ if timesteps is not None:
67
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
68
+ if not accepts_timesteps:
69
+ raise ValueError(
70
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
71
+ f" timestep schedules. Please check whether you are using the correct scheduler."
72
+ )
73
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
74
+ timesteps = scheduler.timesteps
75
+ num_inference_steps = len(timesteps)
76
+ elif sigmas is not None:
77
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
78
+ if not accept_sigmas:
79
+ raise ValueError(
80
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
81
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
82
+ )
83
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
84
+ timesteps = scheduler.timesteps
85
+ num_inference_steps = len(timesteps)
86
+ else:
87
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
88
+ timesteps = scheduler.timesteps
89
+ return timesteps, num_inference_steps
90
+
91
+
92
+ @dataclass
93
+ class WanPipelineOutput(BaseOutput):
94
+ r"""
95
+ Output class for Wan pipelines.
96
+
97
+ Args:
98
+ videos: full decoded video tensor
99
+ ground_videos: decoded grounding segment (optional)
100
+ edit_videos: decoded edited segment (optional)
101
+ """
102
+
103
+ videos: torch.Tensor
104
+ ground_videos: Optional[torch.Tensor] = None
105
+ edit_videos: Optional[torch.Tensor] = None
106
+
107
+
108
+ class WanPipeline(DiffusionPipeline):
109
+ r"""
110
+ Pipeline for text-to-video generation using Wan.
111
+
112
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
113
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
114
+ """
115
+
116
+ _optional_components = []
117
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
118
+
119
+ _callback_tensor_inputs = [
120
+ "latents",
121
+ "prompt_embeds",
122
+ "negative_prompt_embeds",
123
+ ]
124
+
125
+ def __init__(
126
+ self,
127
+ tokenizer: AutoTokenizer,
128
+ text_encoder: WanT5EncoderModel,
129
+ vae: AutoencoderKLWan,
130
+ transformer: WanTransformer3DModel,
131
+ scheduler: FlowMatchEulerDiscreteScheduler,
132
+ ):
133
+ super().__init__()
134
+
135
+ self.register_modules(
136
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
137
+ )
138
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae.spatial_compression_ratio)
139
+
140
+ def _get_t5_prompt_embeds(
141
+ self,
142
+ prompt: Union[str, List[str]] = None,
143
+ num_videos_per_prompt: int = 1,
144
+ max_sequence_length: int = 512,
145
+ device: Optional[torch.device] = None,
146
+ dtype: Optional[torch.dtype] = None,
147
+ ):
148
+ device = device or self._execution_device
149
+ dtype = dtype or self.text_encoder.dtype
150
+
151
+ prompt = [prompt] if isinstance(prompt, str) else prompt
152
+ batch_size = len(prompt)
153
+
154
+ text_inputs = self.tokenizer(
155
+ prompt,
156
+ padding="max_length",
157
+ max_length=max_sequence_length,
158
+ truncation=True,
159
+ add_special_tokens=True,
160
+ return_tensors="pt",
161
+ )
162
+ text_input_ids = text_inputs.input_ids
163
+ prompt_attention_mask = text_inputs.attention_mask
164
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
165
+
166
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
167
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
168
+ logger.warning(
169
+ "The following part of your input was truncated because `max_sequence_length` is set to "
170
+ f" {max_sequence_length} tokens: {removed_text}"
171
+ )
172
+
173
+ seq_lens = prompt_attention_mask.gt(0).sum(dim=1).long()
174
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask.to(device))[0]
175
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
176
+
177
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
178
+ _, seq_len, _ = prompt_embeds.shape
179
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
180
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
181
+
182
+ return [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
183
+
184
+ def encode_prompt(
185
+ self,
186
+ prompt: Union[str, List[str]],
187
+ negative_prompt: Optional[Union[str, List[str]]] = None,
188
+ do_classifier_free_guidance: bool = True,
189
+ num_videos_per_prompt: int = 1,
190
+ prompt_embeds: Optional[torch.Tensor] = None,
191
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
192
+ max_sequence_length: int = 512,
193
+ device: Optional[torch.device] = None,
194
+ dtype: Optional[torch.dtype] = None,
195
+ ):
196
+ r"""
197
+ Encodes the prompt into text encoder hidden states.
198
+
199
+ Args:
200
+ prompt (`str` or `List[str]`, *optional*):
201
+ prompt to be encoded
202
+ negative_prompt (`str` or `List[str]`, *optional*):
203
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
204
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
205
+ less than `1`).
206
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
207
+ Whether to use classifier free guidance or not.
208
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
209
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
210
+ prompt_embeds (`torch.Tensor`, *optional*):
211
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
212
+ provided, text embeddings will be generated from `prompt` input argument.
213
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
214
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
215
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
216
+ argument.
217
+ device: (`torch.device`, *optional*):
218
+ torch device
219
+ dtype: (`torch.dtype`, *optional*):
220
+ torch dtype
221
+ """
222
+ device = device or self._execution_device
223
+
224
+ prompt = [prompt] if isinstance(prompt, str) else prompt
225
+ if prompt is not None:
226
+ batch_size = len(prompt)
227
+ else:
228
+ batch_size = prompt_embeds.shape[0]
229
+
230
+ if prompt_embeds is None:
231
+ prompt_embeds = self._get_t5_prompt_embeds(
232
+ prompt=prompt,
233
+ num_videos_per_prompt=num_videos_per_prompt,
234
+ max_sequence_length=max_sequence_length,
235
+ device=device,
236
+ dtype=dtype,
237
+ )
238
+
239
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
240
+ negative_prompt = negative_prompt or ""
241
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
242
+
243
+ if prompt is not None and type(prompt) is not type(negative_prompt):
244
+ raise TypeError(
245
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
246
+ f" {type(prompt)}."
247
+ )
248
+ elif batch_size != len(negative_prompt):
249
+ raise ValueError(
250
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
251
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
252
+ " the batch size of `prompt`."
253
+ )
254
+
255
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
256
+ prompt=negative_prompt,
257
+ num_videos_per_prompt=num_videos_per_prompt,
258
+ max_sequence_length=max_sequence_length,
259
+ device=device,
260
+ dtype=dtype,
261
+ )
262
+
263
+ return prompt_embeds, negative_prompt_embeds
264
+
265
+ def prepare_latents(
266
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
267
+ ):
268
+ if isinstance(generator, list) and len(generator) != batch_size:
269
+ raise ValueError(
270
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
271
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
272
+ )
273
+
274
+ shape = (
275
+ batch_size,
276
+ num_channels_latents,
277
+ (num_frames - 1) // self.vae.temporal_compression_ratio + 1,
278
+ height // self.vae.spatial_compression_ratio,
279
+ width // self.vae.spatial_compression_ratio,
280
+ )
281
+
282
+ if latents is None:
283
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
284
+ else:
285
+ latents = latents.to(device)
286
+
287
+ # scale the initial noise by the standard deviation required by the scheduler
288
+ if hasattr(self.scheduler, "init_noise_sigma"):
289
+ latents = latents * self.scheduler.init_noise_sigma
290
+ return latents
291
+
292
+ def prepare_video_latents(
293
+ self,
294
+ video: torch.Tensor,
295
+ batch_size: int = 1,
296
+ num_channels_latents: int = 16,
297
+ height: int = 480,
298
+ width: int = 832,
299
+ dtype: torch.dtype = torch.float32,
300
+ device: torch.device = None,
301
+ generator: torch.Generator = None,
302
+ condition_count: int = None,
303
+ latents: torch.Tensor = None,
304
+ timestep: torch.Tensor = None,
305
+ ):
306
+
307
+ video = video.to(device=device, dtype=dtype)
308
+ num_latent_frames = (video.shape[2] - 1) // self.vae.temporal_compression_ratio + 1
309
+
310
+ shape = (
311
+ batch_size,
312
+ num_channels_latents,
313
+ num_latent_frames,
314
+ height // self.vae.spatial_compression_ratio,
315
+ width // self.vae.spatial_compression_ratio,
316
+ )
317
+
318
+ if latents is not None:
319
+ return latents.to(device=device, dtype=dtype)
320
+
321
+ video_latents = []
322
+ print('video',video.shape)
323
+ for i in range(video.shape[0]):
324
+ # self.vae.encode is assumed to return (LatentDistribution, ...)
325
+ latent_dist = self.vae.encode(video[i : i + 1])[0]
326
+ latent = latent_dist.mode() # take the distribution mode directly, no mean/std sampling
327
+ video_latents.append(latent)
328
+ init_latents = torch.cat(video_latents, dim=0) # (B, C, T, H', W')
329
+
330
+ # replace latent frames from index condition_count onwards with random noise
331
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
332
+ init_latents[:, :, condition_count:, :, :] = noise[:, :, condition_count:, :, :]
333
+
334
+ # alternatively, noise could be added via the scheduler (kept below for reference)
335
+ # init_latents[:, :, condition_count:, :, :] = self.scheduler.add_noise(
336
+ # init_latents[:, :, condition_count:, :, :],
337
+ # noise[:, :, condition_count:, :, :],
338
+ # timestep
339
+ # )
340
+ # print('init_latents shape',init_latents.shape)
341
+ return init_latents
342
+
343
+ def prepare_video_latents_new(
344
+ self,
345
+ video: torch.Tensor,
346
+ batch_size: int = 1,
347
+ num_channels_latents: int = 16,
348
+ height: int = 480,
349
+ width: int = 832,
350
+ dtype: torch.dtype = torch.float32,
351
+ device: torch.device = None,
352
+ generator: torch.Generator = None,
353
+ condition_count: int = None,
354
+ latents: torch.Tensor = None,
355
+ timestep: torch.Tensor = None,
356
+ ):
357
+
358
+ video = video.to(device=device, dtype=dtype)
359
+
360
+ if latents is not None:
361
+ return latents.to(device=device, dtype=dtype)
362
+
363
+ video_latents = []
364
+ print('video',video.shape)
365
+ for i in range(video.shape[0]):
366
+ # self.vae.encode is assumed to return (LatentDistribution, ...)
367
+ latent_dist = self.vae.encode(video[i : i + 1])[0]
368
+ latent = latent_dist.mode() # take the distribution mode directly, no mean/std sampling
369
+ video_latents.append(latent)
370
+ org_latents = torch.cat(video_latents, dim=0) # (B, C, T, H', W')
371
+ print('org_latents',org_latents.shape)
372
+
373
+ # append random noise frames after the source latents; the noise has the same shape as org_latents
374
+ noise = randn_tensor(org_latents.shape, generator=generator, device=device, dtype=dtype)
375
+ print('noise',noise.shape)
376
+ init_latents = torch.cat([org_latents, noise], dim=2)
377
+ print('init_latents',init_latents.shape)
378
+ return init_latents
379
+
380
+
381
+ def prepare_cot_video_latents(
382
+ self,
383
+ video: torch.Tensor,
384
+ reasoning_latent_count: int = 1,
385
+ batch_size: int = 1,
386
+ num_channels_latents: int = 16,
387
+ height: int = 480,
388
+ width: int = 832,
389
+ dtype: torch.dtype = torch.float32,
390
+ device: torch.device = None,
391
+ generator: torch.Generator = None,
392
+ condition_count: int = None,
393
+ latents: torch.Tensor = None,
394
+ timestep: torch.Tensor = None,
395
+ ):
396
+
397
+ video = video.to(device=device, dtype=dtype)
398
+
399
+ if latents is not None:
400
+ return latents.to(device=device, dtype=dtype)
401
+
402
+ video_latents = []
403
+ #print('video',video.shape)
404
+ for i in range(video.shape[0]):
405
+ # self.vae.encode is assumed to return (LatentDistribution, ...)
406
+ latent_dist = self.vae.encode(video[i : i + 1])[0]
407
+ latent = latent_dist.mode() # take the distribution mode directly, no mean/std sampling
408
+ video_latents.append(latent)
409
+ org_latents = torch.cat(video_latents, dim=0) # (B, C, T, H', W')
410
+ print('org_latents',org_latents.shape)
411
+ batch_size, num_channels_latents, num_frames_latent, height_latent, width_latent = org_latents.shape
412
+ tgt_frames = num_frames_latent + reasoning_latent_count
413
+ noise_latents_shape = (batch_size, num_channels_latents, tgt_frames, height_latent, width_latent)
414
+ # append random noise frames (source length + reasoning_latent_count) after the source latents
415
+ noise = randn_tensor(noise_latents_shape, generator=generator, device=device, dtype=dtype)
416
+ print('noise',noise.shape)
417
+ init_latents = torch.cat([org_latents, noise], dim=2)
418
+ print('init_latents',init_latents.shape)
419
+ return init_latents
420
+
421
+
422
+
423
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
424
+ frames = self.vae.decode(latents.to(self.vae.dtype)).sample
425
+ frames = (frames / 2 + 0.5).clamp(0, 1)
426
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
427
+ frames = frames.cpu().float().numpy()
428
+ return frames
429
+
430
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
431
+ def prepare_extra_step_kwargs(self, generator, eta):
432
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
433
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
434
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
435
+ # and should be between [0, 1]
436
+
437
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
438
+ extra_step_kwargs = {}
439
+ if accepts_eta:
440
+ extra_step_kwargs["eta"] = eta
441
+
442
+ # check if the scheduler accepts generator
443
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
444
+ if accepts_generator:
445
+ extra_step_kwargs["generator"] = generator
446
+ return extra_step_kwargs
447
+
448
+ # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
449
+ def check_inputs(
450
+ self,
451
+ prompt,
452
+ height,
453
+ width,
454
+ negative_prompt,
455
+ callback_on_step_end_tensor_inputs,
456
+ prompt_embeds=None,
457
+ negative_prompt_embeds=None,
458
+ ):
459
+ if height % 8 != 0 or width % 8 != 0:
460
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
461
+
462
+ if callback_on_step_end_tensor_inputs is not None and not all(
463
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
464
+ ):
465
+ raise ValueError(
466
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
467
+ )
468
+ if prompt is not None and prompt_embeds is not None:
469
+ raise ValueError(
470
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
471
+ " only forward one of the two."
472
+ )
473
+ elif prompt is None and prompt_embeds is None:
474
+ raise ValueError(
475
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
476
+ )
477
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
478
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
479
+
480
+ if prompt is not None and negative_prompt_embeds is not None:
481
+ raise ValueError(
482
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
483
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
484
+ )
485
+
486
+ if negative_prompt is not None and negative_prompt_embeds is not None:
487
+ raise ValueError(
488
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
489
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
490
+ )
491
+
492
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
493
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
494
+ raise ValueError(
495
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
496
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
497
+ f" {negative_prompt_embeds.shape}."
498
+ )
499
+
500
+ @property
501
+ def guidance_scale(self):
502
+ return self._guidance_scale
503
+
504
+ @property
505
+ def num_timesteps(self):
506
+ return self._num_timesteps
507
+
508
+ @property
509
+ def attention_kwargs(self):
510
+ return self._attention_kwargs
511
+
512
+ @property
513
+ def interrupt(self):
514
+ return self._interrupt
515
+
516
+ @torch.no_grad()
517
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
518
+ def __call__(
519
+ self,
520
+ video: Union[torch.FloatTensor] = None,
521
+ prompt: Optional[Union[str, List[str]]] = None,
522
+ negative_prompt: Optional[Union[str, List[str]]] = None,
523
+ height: int = 480,
524
+ width: int = 720,
525
+ num_frames: int = 49,
526
+ source_frames: int = 33,
527
+ reasoning_frames: int = 4,
528
+ num_inference_steps: int = 50,
529
+ timesteps: Optional[List[int]] = None,
530
+ guidance_scale: float = 6,
531
+ num_videos_per_prompt: int = 1,
532
+ eta: float = 0.0,
533
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
534
+ latents: Optional[torch.FloatTensor] = None,
535
+ prompt_embeds: Optional[torch.FloatTensor] = None,
536
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
537
+ output_type: str = "numpy",
538
+ return_dict: bool = False,
539
+ callback_on_step_end: Optional[
540
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
541
+ ] = None,
542
+ attention_kwargs: Optional[Dict[str, Any]] = None,
543
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
544
+ max_sequence_length: int = 512,
545
+ comfyui_progressbar: bool = False,
546
+ shift: int = 5,
547
+ repeat_rope: bool = True,
548
+ cot: bool = False,
549
+ ) -> Union[WanPipelineOutput, Tuple]:
550
+ """
551
+ Function invoked when calling the pipeline for generation.
552
+ Args:
553
+
554
+ Examples:
555
+
556
+ Returns:
557
+
558
+ """
559
+
560
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
561
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
562
+ num_videos_per_prompt = 1
563
+
564
+ # 1. Check inputs. Raise error if not correct
565
+ self.check_inputs(
566
+ prompt,
567
+ height,
568
+ width,
569
+ negative_prompt,
570
+ callback_on_step_end_tensor_inputs,
571
+ prompt_embeds,
572
+ negative_prompt_embeds,
573
+ )
574
+ self._guidance_scale = guidance_scale
575
+ self._attention_kwargs = attention_kwargs
576
+ self._interrupt = False
577
+
578
+ # 2. Default call parameters
579
+ if prompt is not None and isinstance(prompt, str):
580
+ batch_size = 1
581
+ elif prompt is not None and isinstance(prompt, list):
582
+ batch_size = len(prompt)
583
+ else:
584
+ batch_size = prompt_embeds.shape[0]
585
+
586
+ device = self._execution_device
587
+ weight_dtype = self.text_encoder.dtype
588
+
589
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
590
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
591
+ # corresponds to doing no classifier free guidance.
592
+ do_classifier_free_guidance = guidance_scale > 1.0
593
+
594
+ # 3. Encode input prompt
595
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
596
+ prompt,
597
+ negative_prompt,
598
+ do_classifier_free_guidance,
599
+ num_videos_per_prompt=num_videos_per_prompt,
600
+ prompt_embeds=prompt_embeds,
601
+ negative_prompt_embeds=negative_prompt_embeds,
602
+ max_sequence_length=max_sequence_length,
603
+ device=device,
604
+ )
605
+ if do_classifier_free_guidance:
606
+ in_prompt_embeds = negative_prompt_embeds + prompt_embeds
607
+ else:
608
+ in_prompt_embeds = prompt_embeds
609
+
610
+ # 4. Prepare timesteps
611
+ if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler):
612
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, mu=1)
613
+ elif isinstance(self.scheduler, FlowUniPCMultistepScheduler):
614
+ self.scheduler.set_timesteps(num_inference_steps, device=device, shift=shift)
615
+ timesteps = self.scheduler.timesteps
616
+ elif isinstance(self.scheduler, FlowDPMSolverMultistepScheduler):
617
+ sampling_sigmas = get_sampling_sigmas(num_inference_steps, shift)
618
+ timesteps, _ = retrieve_timesteps(
619
+ self.scheduler,
620
+ device=device,
621
+ sigmas=sampling_sigmas)
622
+ else:
623
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
624
+ self._num_timesteps = len(timesteps)
625
+ if comfyui_progressbar:
626
+ from comfy.utils import ProgressBar
627
+ pbar = ProgressBar(num_inference_steps + 1)
628
+
629
+ # compute latent source length consistent with training: (F-1)//ratio + 1, or 1 when F==1
630
+ compression_ratio = getattr(self.vae, "temporal_compression_ratio", 4)
631
+ condition_count = 1 if source_frames == 1 else (source_frames - 1) // compression_ratio + 1
632
+
633
+ # 5. Prepare latents (unified across org/repeat/cot)
634
+ latent_channels = self.transformer.config.in_channels
635
+ if cot:
636
+ # latent grounding segment length from pixel-space reasoning_frames (used only when cot=True)
637
+ ground_latent_count = 1 if reasoning_frames <= 1 else (reasoning_frames - 1) // compression_ratio + 1
638
+ print('ground_latent_count',ground_latent_count)
639
+ latents = self.prepare_cot_video_latents(
640
+ video,
641
+ ground_latent_count,
642
+ batch_size,
643
+ latent_channels,
644
+ height,
645
+ width,
646
+ weight_dtype,
647
+ device,
648
+ generator,
649
+ condition_count,
650
+ latents,
651
+ )
652
+ elif repeat_rope:
653
+ latents = self.prepare_video_latents_new(
654
+ video,
655
+ batch_size,
656
+ latent_channels,
657
+ height,
658
+ width,
659
+ weight_dtype,
660
+ device,
661
+ generator,
662
+ condition_count,
663
+ latents,
664
+ )
665
+ else:
666
+ latents = self.prepare_video_latents_new(
667
+ video,
668
+ batch_size,
669
+ latent_channels,
670
+ height,
671
+ width,
672
+ weight_dtype,
673
+ device,
674
+ generator,
675
+ condition_count,
676
+ latents,
677
+ )
678
+ if comfyui_progressbar:
679
+ pbar.update(1)
680
+
681
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
682
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
683
+
684
+ # Get actual latent dimensions (consistent with training)
685
+ #print('latents',latents.shape)
686
+ bsz, channel, actual_num_frames, actual_height, actual_width = latents.size()
687
+ target_shape = (self.vae.latent_channels, actual_num_frames, actual_height, actual_width)
688
+ #print('target_shape',target_shape)
689
+ seq_len = math.ceil((target_shape[2] * target_shape[3]) / (self.transformer.config.patch_size[1] * self.transformer.config.patch_size[2]) * target_shape[1])
690
+ # 7. Denoising loop
691
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
692
+ self.transformer.num_inference_steps = num_inference_steps
693
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
694
+ for i, t in enumerate(timesteps):
695
+ self.transformer.current_steps = i
696
+
697
+ if self.interrupt:
698
+ continue
699
+
700
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
701
+ if hasattr(self.scheduler, "scale_model_input"):
702
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
703
+
704
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
705
+ timestep = t.expand(latent_model_input.shape[0])
706
+
707
+ # predict noise model_output
708
+ with torch.cuda.amp.autocast(dtype=weight_dtype), torch.cuda.device(device=device):
709
+ # frame_split_indices enables repeat temporal RoPE for paired (src+tgt) inputs
710
+ frame_split_indices = None
711
+ ground_frame_indices = None
712
+ if repeat_rope and video is not None:
713
+ frame_split_indices = [condition_count] * latent_model_input.shape[0]
714
+ if cot:
715
+ # grounding frames should use temporal RoPE position 0
716
+ ground_frame_indices = [
717
+ (condition_count, condition_count + ground_latent_count)
718
+ ] * latent_model_input.shape[0]
719
+ # print('ground_frame_indices',ground_frame_indices)
720
+ # print('frame_split_indices',frame_split_indices)
721
+ noise_pred = self.transformer(
722
+ x=latent_model_input,
723
+ context=in_prompt_embeds,
724
+ t=timestep,
725
+ seq_len=seq_len,
726
+ frame_split_indices=frame_split_indices,
727
+ ground_frame_indices=ground_frame_indices,
728
+ )
729
+
730
+ # perform guidance
731
+ if do_classifier_free_guidance:
732
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
733
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
734
+
735
+ ######source video no noise pred################
736
+ noise_pred[:, :, :condition_count] = 0
737
+ ######source video no noise pred################
738
+
739
+ # compute the previous noisy sample x_t -> x_t-1
740
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
741
+
742
+ if callback_on_step_end is not None:
743
+ callback_kwargs = {}
744
+ for k in callback_on_step_end_tensor_inputs:
745
+ callback_kwargs[k] = locals()[k]
746
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
747
+
748
+ latents = callback_outputs.pop("latents", latents)
749
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
750
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
751
+
752
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
753
+ progress_bar.update()
754
+ if comfyui_progressbar:
755
+ pbar.update(1)
756
+
757
+ # Optionally decode outputs. For cot=True, segment into src/ground/edit; otherwise decode whole latents
758
+ ground_video = None
759
+ edit_video = None
760
+ if cot:
761
+ if output_type == "numpy":
762
+ ground_start = condition_count
763
+ ground_end = condition_count + ground_latent_count
764
+ src_lat = latents[:, :, :ground_start] if ground_start > 0 else None
765
+ ground_lat = latents[:, :, ground_start:ground_end] if ground_end > ground_start and ground_start < latents.shape[2] else None
766
+ edit_lat = latents[:, :, ground_end:] if ground_end < latents.shape[2] else None
767
+
768
+ parts = []
769
+ ## only ground and edit
770
+ if ground_lat is not None and ground_lat.shape[2] > 0:
771
+ ground_video = self.decode_latents(ground_lat)
772
+ parts.append(ground_video)
773
+ if edit_lat is not None and edit_lat.shape[2] > 0:
774
+ edit_video = self.decode_latents(edit_lat)
775
+ parts.append(edit_video)
776
+ print('ground_video', None if ground_video is None else ground_video.shape, 'edit_video', None if edit_video is None else edit_video.shape)
777
+ video = np.concatenate(parts, axis=2)
778
+ else:
779
+ # org/repeat: split by condition_count -> src + edit, then temporal concat
780
+ if output_type == "numpy":
781
+ src_lat = latents[:, :, :condition_count] if condition_count > 0 else None
782
+ edit_lat = latents[:, :, condition_count:] if condition_count < latents.shape[2] else None
783
+ ## only decode edit video
784
+ if edit_lat is not None and edit_lat.shape[2] > 0:
785
+ edit_video = self.decode_latents(edit_lat)
786
+ video = edit_video
787
+
788
+ # Offload all models
789
+ self.maybe_free_model_hooks()
790
+
791
+ if not return_dict:
792
+ if isinstance(video, np.ndarray):
793
+ video = torch.from_numpy(video)
794
+ if ground_video is not None and isinstance(ground_video, np.ndarray):
795
+ ground_video = torch.from_numpy(ground_video)
796
+ if edit_video is not None and isinstance(edit_video, np.ndarray):
797
+ edit_video = torch.from_numpy(edit_video)
798
+
799
+ return WanPipelineOutput(videos=video, ground_videos=ground_video, edit_videos=edit_video)
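Note (not part of this commit): the pipeline above splits the latent sequence at `condition_count`, computed from the pixel-space `source_frames` and the VAE's temporal compression ratio; only frames after that index are denoised (`noise_pred[:, :, :condition_count] = 0`). A tiny standalone check of that arithmetic, assuming the default compression ratio of 4:

```python
def latent_condition_count(source_frames: int, compression_ratio: int = 4) -> int:
    # mirrors: 1 if source_frames == 1 else (source_frames - 1) // compression_ratio + 1
    return 1 if source_frames == 1 else (source_frames - 1) // compression_ratio + 1

for frames in (1, 33, 49):
    print(frames, "->", latent_condition_count(frames))
# 1 -> 1, 33 -> 9, 49 -> 13 latent condition frames
```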
videox_fun/pipeline/pipeline_wan2_2.py ADDED
@@ -0,0 +1,591 @@
1
+ import inspect
2
+ import math
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from diffusers import FlowMatchEulerDiscreteScheduler
9
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
10
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
11
+ from diffusers.utils import BaseOutput, logging, replace_example_docstring
12
+ from diffusers.utils.torch_utils import randn_tensor
13
+ from diffusers.video_processor import VideoProcessor
14
+
15
+ from ..models import (AutoencoderKLWan, AutoTokenizer,
16
+ WanT5EncoderModel, Wan2_2Transformer3DModel)
17
+ from ..utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
18
+ get_sampling_sigmas)
19
+ from ..utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
20
+
21
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
22
+
23
+
24
+ EXAMPLE_DOC_STRING = """
25
+ Examples:
26
+ ```python
27
+ pass
28
+ ```
29
+ """
30
+
31
+
32
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
33
+ def retrieve_timesteps(
34
+ scheduler,
35
+ num_inference_steps: Optional[int] = None,
36
+ device: Optional[Union[str, torch.device]] = None,
37
+ timesteps: Optional[List[int]] = None,
38
+ sigmas: Optional[List[float]] = None,
39
+ **kwargs,
40
+ ):
41
+ """
42
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
43
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
44
+
45
+ Args:
46
+ scheduler (`SchedulerMixin`):
47
+ The scheduler to get timesteps from.
48
+ num_inference_steps (`int`):
49
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
50
+ must be `None`.
51
+ device (`str` or `torch.device`, *optional*):
52
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
53
+ timesteps (`List[int]`, *optional*):
54
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
55
+ `num_inference_steps` and `sigmas` must be `None`.
56
+ sigmas (`List[float]`, *optional*):
57
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
58
+ `num_inference_steps` and `timesteps` must be `None`.
59
+
60
+ Returns:
61
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
62
+ second element is the number of inference steps.
63
+ """
64
+ if timesteps is not None and sigmas is not None:
65
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
66
+ if timesteps is not None:
67
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
68
+ if not accepts_timesteps:
69
+ raise ValueError(
70
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
71
+ f" timestep schedules. Please check whether you are using the correct scheduler."
72
+ )
73
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
74
+ timesteps = scheduler.timesteps
75
+ num_inference_steps = len(timesteps)
76
+ elif sigmas is not None:
77
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
78
+ if not accept_sigmas:
79
+ raise ValueError(
80
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
81
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
82
+ )
83
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
84
+ timesteps = scheduler.timesteps
85
+ num_inference_steps = len(timesteps)
86
+ else:
87
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
88
+ timesteps = scheduler.timesteps
89
+ return timesteps, num_inference_steps
90
+
91
+
92
+ @dataclass
93
+ class WanPipelineOutput(BaseOutput):
94
+ r"""
95
+ Output class for Wan pipelines.
96
+
97
+ Args:
98
+ video (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
99
+ List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
100
+ denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
101
+ `(batch_size, num_frames, channels, height, width)`.
102
+ """
103
+
104
+ videos: torch.Tensor
105
+
106
+
107
+ class Wan2_2Pipeline(DiffusionPipeline):
108
+ r"""
109
+ Pipeline for text-to-video generation using Wan.
110
+
111
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
112
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
113
+ """
114
+
115
+ _optional_components = ["transformer_2"]
116
+ model_cpu_offload_seq = "text_encoder->transformer_2->transformer->vae"
117
+
118
+ _callback_tensor_inputs = [
119
+ "latents",
120
+ "prompt_embeds",
121
+ "negative_prompt_embeds",
122
+ ]
123
+
124
+ def __init__(
125
+ self,
126
+ tokenizer: AutoTokenizer,
127
+ text_encoder: WanT5EncoderModel,
128
+ vae: AutoencoderKLWan,
129
+ transformer: Wan2_2Transformer3DModel,
130
+ transformer_2: Wan2_2Transformer3DModel = None,
131
+ scheduler: FlowMatchEulerDiscreteScheduler = None,
132
+ ):
133
+ super().__init__()
134
+
135
+ self.register_modules(
136
+ tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer,
137
+ transformer_2=transformer_2, scheduler=scheduler
138
+ )
139
+ self.video_processor = VideoProcessor(vae_scale_factor=self.vae.spatial_compression_ratio)
140
+
141
+ def _get_t5_prompt_embeds(
142
+ self,
143
+ prompt: Union[str, List[str]] = None,
144
+ num_videos_per_prompt: int = 1,
145
+ max_sequence_length: int = 512,
146
+ device: Optional[torch.device] = None,
147
+ dtype: Optional[torch.dtype] = None,
148
+ ):
149
+ device = device or self._execution_device
150
+ dtype = dtype or self.text_encoder.dtype
151
+
152
+ prompt = [prompt] if isinstance(prompt, str) else prompt
153
+ batch_size = len(prompt)
154
+
155
+ text_inputs = self.tokenizer(
156
+ prompt,
157
+ padding="max_length",
158
+ max_length=max_sequence_length,
159
+ truncation=True,
160
+ add_special_tokens=True,
161
+ return_tensors="pt",
162
+ )
163
+ text_input_ids = text_inputs.input_ids
164
+ prompt_attention_mask = text_inputs.attention_mask
165
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
166
+
167
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
168
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
169
+ logger.warning(
170
+ "The following part of your input was truncated because `max_sequence_length` is set to "
171
+ f" {max_sequence_length} tokens: {removed_text}"
172
+ )
173
+
174
+ seq_lens = prompt_attention_mask.gt(0).sum(dim=1).long()
175
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask.to(device))[0]
176
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
177
+
178
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
179
+ _, seq_len, _ = prompt_embeds.shape
180
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
181
+ prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
182
+
183
+ return [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
184
+
185
+ def encode_prompt(
186
+ self,
187
+ prompt: Union[str, List[str]],
188
+ negative_prompt: Optional[Union[str, List[str]]] = None,
189
+ do_classifier_free_guidance: bool = True,
190
+ num_videos_per_prompt: int = 1,
191
+ prompt_embeds: Optional[torch.Tensor] = None,
192
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
193
+ max_sequence_length: int = 512,
194
+ device: Optional[torch.device] = None,
195
+ dtype: Optional[torch.dtype] = None,
196
+ ):
197
+ r"""
198
+ Encodes the prompt into text encoder hidden states.
199
+
200
+ Args:
201
+ prompt (`str` or `List[str]`, *optional*):
202
+ prompt to be encoded
203
+ negative_prompt (`str` or `List[str]`, *optional*):
204
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
205
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
206
+ less than `1`).
207
+ do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
208
+ Whether to use classifier free guidance or not.
209
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
210
+ Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
211
+ prompt_embeds (`torch.Tensor`, *optional*):
212
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
213
+ provided, text embeddings will be generated from `prompt` input argument.
214
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
215
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
216
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
217
+ argument.
218
+ device: (`torch.device`, *optional*):
219
+ torch device
220
+ dtype: (`torch.dtype`, *optional*):
221
+ torch dtype
222
+ """
223
+ device = device or self._execution_device
224
+
225
+ prompt = [prompt] if isinstance(prompt, str) else prompt
226
+ if prompt is not None:
227
+ batch_size = len(prompt)
228
+ else:
229
+ batch_size = prompt_embeds.shape[0]
230
+
231
+ if prompt_embeds is None:
232
+ prompt_embeds = self._get_t5_prompt_embeds(
233
+ prompt=prompt,
234
+ num_videos_per_prompt=num_videos_per_prompt,
235
+ max_sequence_length=max_sequence_length,
236
+ device=device,
237
+ dtype=dtype,
238
+ )
239
+
240
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
241
+ negative_prompt = negative_prompt or ""
242
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
243
+
244
+ if prompt is not None and type(prompt) is not type(negative_prompt):
245
+ raise TypeError(
246
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
247
+ f" {type(prompt)}."
248
+ )
249
+ elif batch_size != len(negative_prompt):
250
+ raise ValueError(
251
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
252
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
253
+ " the batch size of `prompt`."
254
+ )
255
+
256
+ negative_prompt_embeds = self._get_t5_prompt_embeds(
257
+ prompt=negative_prompt,
258
+ num_videos_per_prompt=num_videos_per_prompt,
259
+ max_sequence_length=max_sequence_length,
260
+ device=device,
261
+ dtype=dtype,
262
+ )
263
+
264
+ return prompt_embeds, negative_prompt_embeds
265
+
266
+ def prepare_latents(
267
+ self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
268
+ ):
269
+ if isinstance(generator, list) and len(generator) != batch_size:
270
+ raise ValueError(
271
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
272
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
273
+ )
274
+
275
+ shape = (
276
+ batch_size,
277
+ num_channels_latents,
278
+ (num_frames - 1) // self.vae.temporal_compression_ratio + 1,
279
+ height // self.vae.spatial_compression_ratio,
280
+ width // self.vae.spatial_compression_ratio,
281
+ )
282
+
283
+ if latents is None:
284
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
285
+ else:
286
+ latents = latents.to(device)
287
+
288
+ # scale the initial noise by the standard deviation required by the scheduler
289
+ if hasattr(self.scheduler, "init_noise_sigma"):
290
+ latents = latents * self.scheduler.init_noise_sigma
291
+ return latents
292
+
293
+ def decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
294
+ frames = self.vae.decode(latents.to(self.vae.dtype)).sample
295
+ frames = (frames / 2 + 0.5).clamp(0, 1)
296
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
297
+ frames = frames.cpu().float().numpy()
298
+ return frames
299
+
300
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
301
+ def prepare_extra_step_kwargs(self, generator, eta):
302
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
303
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
304
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
305
+ # and should be between [0, 1]
306
+
307
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
308
+ extra_step_kwargs = {}
309
+ if accepts_eta:
310
+ extra_step_kwargs["eta"] = eta
311
+
312
+ # check if the scheduler accepts generator
313
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
314
+ if accepts_generator:
315
+ extra_step_kwargs["generator"] = generator
316
+ return extra_step_kwargs
317
+
318
+ # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs
319
+ def check_inputs(
320
+ self,
321
+ prompt,
322
+ height,
323
+ width,
324
+ negative_prompt,
325
+ callback_on_step_end_tensor_inputs,
326
+ prompt_embeds=None,
327
+ negative_prompt_embeds=None,
328
+ ):
329
+ if height % 8 != 0 or width % 8 != 0:
330
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
331
+
332
+ if callback_on_step_end_tensor_inputs is not None and not all(
333
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
334
+ ):
335
+ raise ValueError(
336
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
337
+ )
338
+ if prompt is not None and prompt_embeds is not None:
339
+ raise ValueError(
340
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
341
+ " only forward one of the two."
342
+ )
343
+ elif prompt is None and prompt_embeds is None:
344
+ raise ValueError(
345
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
346
+ )
347
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
348
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
349
+
350
+ if prompt is not None and negative_prompt_embeds is not None:
351
+ raise ValueError(
352
+ f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
353
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
354
+ )
355
+
356
+ if negative_prompt is not None and negative_prompt_embeds is not None:
357
+ raise ValueError(
358
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
359
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
360
+ )
361
+
362
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
363
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
364
+ raise ValueError(
365
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
366
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
367
+ f" {negative_prompt_embeds.shape}."
368
+ )
369
+
370
+ @property
371
+ def guidance_scale(self):
372
+ return self._guidance_scale
373
+
374
+ @property
375
+ def num_timesteps(self):
376
+ return self._num_timesteps
377
+
378
+ @property
379
+ def attention_kwargs(self):
380
+ return self._attention_kwargs
381
+
382
+ @property
383
+ def interrupt(self):
384
+ return self._interrupt
385
+
386
+ @torch.no_grad()
387
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
388
+ def __call__(
389
+ self,
390
+ prompt: Optional[Union[str, List[str]]] = None,
391
+ negative_prompt: Optional[Union[str, List[str]]] = None,
392
+ height: int = 480,
393
+ width: int = 720,
394
+ num_frames: int = 49,
395
+ num_inference_steps: int = 50,
396
+ timesteps: Optional[List[int]] = None,
397
+ guidance_scale: float = 6,
398
+ num_videos_per_prompt: int = 1,
399
+ eta: float = 0.0,
400
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
401
+ latents: Optional[torch.FloatTensor] = None,
402
+ prompt_embeds: Optional[torch.FloatTensor] = None,
403
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
404
+ output_type: str = "numpy",
405
+ return_dict: bool = False,
406
+ callback_on_step_end: Optional[
407
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
408
+ ] = None,
409
+ attention_kwargs: Optional[Dict[str, Any]] = None,
410
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
411
+ max_sequence_length: int = 512,
412
+ boundary: float = 0.875,
413
+ comfyui_progressbar: bool = False,
414
+ shift: int = 5,
415
+ ) -> Union[WanPipelineOutput, Tuple]:
416
+ """
417
+ Function invoked when calling the pipeline for generation.
418
+ Args:
419
+
420
+ Examples:
421
+
422
+ Returns:
423
+
424
+ """
425
+
426
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
427
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
428
+ num_videos_per_prompt = 1
429
+
430
+ # 1. Check inputs. Raise error if not correct
431
+ self.check_inputs(
432
+ prompt,
433
+ height,
434
+ width,
435
+ negative_prompt,
436
+ callback_on_step_end_tensor_inputs,
437
+ prompt_embeds,
438
+ negative_prompt_embeds,
439
+ )
440
+ self._guidance_scale = guidance_scale
441
+ self._attention_kwargs = attention_kwargs
442
+ self._interrupt = False
443
+
444
+ # 2. Default call parameters
445
+ if prompt is not None and isinstance(prompt, str):
446
+ batch_size = 1
447
+ elif prompt is not None and isinstance(prompt, list):
448
+ batch_size = len(prompt)
449
+ else:
450
+ batch_size = prompt_embeds.shape[0]
451
+
452
+ device = self._execution_device
453
+ weight_dtype = self.text_encoder.dtype
454
+
455
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
456
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
457
+ # corresponds to doing no classifier free guidance.
458
+ do_classifier_free_guidance = guidance_scale > 1.0
459
+
460
+ # 3. Encode input prompt
461
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
462
+ prompt,
463
+ negative_prompt,
464
+ do_classifier_free_guidance,
465
+ num_videos_per_prompt=num_videos_per_prompt,
466
+ prompt_embeds=prompt_embeds,
467
+ negative_prompt_embeds=negative_prompt_embeds,
468
+ max_sequence_length=max_sequence_length,
469
+ device=device,
470
+ )
471
+ if do_classifier_free_guidance:
472
+ in_prompt_embeds = negative_prompt_embeds + prompt_embeds
473
+ else:
474
+ in_prompt_embeds = prompt_embeds
475
+
476
+ # 4. Prepare timesteps
477
+ if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler):
478
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, mu=1)
479
+ elif isinstance(self.scheduler, FlowUniPCMultistepScheduler):
480
+ self.scheduler.set_timesteps(num_inference_steps, device=device, shift=shift)
481
+ timesteps = self.scheduler.timesteps
482
+ elif isinstance(self.scheduler, FlowDPMSolverMultistepScheduler):
483
+ sampling_sigmas = get_sampling_sigmas(num_inference_steps, shift)
484
+ timesteps, _ = retrieve_timesteps(
485
+ self.scheduler,
486
+ device=device,
487
+ sigmas=sampling_sigmas)
488
+ else:
489
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
490
+ self._num_timesteps = len(timesteps)
491
+ if comfyui_progressbar:
492
+ from comfy.utils import ProgressBar
493
+ pbar = ProgressBar(num_inference_steps + 1)
494
+
495
+ # 5. Prepare latents
496
+ latent_channels = self.transformer.config.in_channels
497
+ latents = self.prepare_latents(
498
+ batch_size * num_videos_per_prompt,
499
+ latent_channels,
500
+ num_frames,
501
+ height,
502
+ width,
503
+ weight_dtype,
504
+ device,
505
+ generator,
506
+ latents,
507
+ )
508
+ if comfyui_progressbar:
509
+ pbar.update(1)
510
+
511
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
512
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
513
+
514
+ target_shape = (self.vae.latent_channels, (num_frames - 1) // self.vae.temporal_compression_ratio + 1, width // self.vae.spatial_compression_ratio, height // self.vae.spatial_compression_ratio)
515
+ seq_len = math.ceil((target_shape[2] * target_shape[3]) / (self.transformer.config.patch_size[1] * self.transformer.config.patch_size[2]) * target_shape[1])
516
+ # 7. Denoising loop
517
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
518
+ self.transformer.num_inference_steps = num_inference_steps
519
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
520
+ for i, t in enumerate(timesteps):
521
+ self.transformer.current_steps = i
522
+
523
+ if self.interrupt:
524
+ continue
525
+
526
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
527
+ if hasattr(self.scheduler, "scale_model_input"):
528
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
529
+
530
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
531
+ timestep = t.expand(latent_model_input.shape[0])
532
+
533
+ if self.transformer_2 is not None:
534
+ if t >= boundary * self.scheduler.config.num_train_timesteps:
535
+ local_transformer = self.transformer_2
536
+ else:
537
+ local_transformer = self.transformer
538
+ else:
539
+ local_transformer = self.transformer
540
+
541
+ # predict noise model_output
542
+ with torch.cuda.amp.autocast(dtype=weight_dtype), torch.cuda.device(device=device):
543
+ noise_pred = local_transformer(
544
+ x=latent_model_input,
545
+ context=in_prompt_embeds,
546
+ t=timestep,
547
+ seq_len=seq_len,
548
+ )
549
+
550
+ # perform guidance
551
+ if do_classifier_free_guidance:
552
+ if self.transformer_2 is not None and (isinstance(self.guidance_scale, (list, tuple))):
553
+ sample_guide_scale = self.guidance_scale[1] if t >= self.transformer_2.config.boundary * self.scheduler.config.num_train_timesteps else self.guidance_scale[0]
554
+ else:
555
+ sample_guide_scale = self.guidance_scale
556
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
557
+ noise_pred = noise_pred_uncond + sample_guide_scale * (noise_pred_text - noise_pred_uncond)
558
+
559
+ # compute the previous noisy sample x_t -> x_t-1
560
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
561
+
562
+ if callback_on_step_end is not None:
563
+ callback_kwargs = {}
564
+ for k in callback_on_step_end_tensor_inputs:
565
+ callback_kwargs[k] = locals()[k]
566
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
567
+
568
+ latents = callback_outputs.pop("latents", latents)
569
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
570
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
571
+
572
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
573
+ progress_bar.update()
574
+ if comfyui_progressbar:
575
+ pbar.update(1)
576
+
577
+ if output_type == "numpy":
578
+ video = self.decode_latents(latents)
579
+ elif not output_type == "latent":
580
+ video = self.decode_latents(latents)
581
+ video = self.video_processor.postprocess_video(video=video, output_type=output_type)
582
+ else:
583
+ video = latents
584
+
585
+ # Offload all models
586
+ self.maybe_free_model_hooks()
587
+
588
+ if not return_dict:
589
+ video = torch.from_numpy(video)
590
+
591
+ return WanPipelineOutput(videos=video)
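
The `__call__` method above ties the pieces together: prompt encoding, scheduler-specific timestep setup, the optional dual-transformer switch controlled by `boundary`, classifier-free guidance, and latent decoding. A minimal invocation sketch follows; it assumes `pipe` is an already-constructed instance of this pipeline (building it from its VAE, text encoder, transformer(s), and scheduler is handled elsewhere, e.g. by the UI controller later in this commit), and the prompt, resolution, and seed are illustrative only.

```python
# Hedged sketch: calling the pipeline defined above. `pipe` is assumed to be an
# already-constructed instance with its components loaded; values are illustrative.
import torch

output = pipe(
    prompt="A corgi running along a beach at sunset",
    negative_prompt="blurry, low quality",
    height=480,
    width=720,
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
    boundary=0.875,   # steps with t >= boundary * num_train_timesteps use transformer_2
    shift=5,          # sigma shift used by the flow-matching schedulers
    generator=torch.Generator("cuda").manual_seed(42),
    output_type="numpy",
)
video = output.videos  # WanPipelineOutput; decoded frames clamped to [0, 1]
```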
videox_fun/ui/cogvideox_fun_ui.py ADDED
@@ -0,0 +1,722 @@
1
+ """Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py
2
+ """
3
+ import os
4
+ import random
5
+
6
+ import cv2
7
+ import gradio as gr
8
+ import numpy as np
9
+ import torch
10
+ from PIL import Image
11
+ from safetensors import safe_open
12
+
13
+ from ..data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
14
+ from ..models import (AutoencoderKLCogVideoX, CogVideoXTransformer3DModel,
15
+ T5EncoderModel, T5Tokenizer)
16
+ from ..pipeline import (CogVideoXFunControlPipeline,
17
+ CogVideoXFunInpaintPipeline, CogVideoXFunPipeline)
18
+ from ..utils.fp8_optimization import (convert_model_weight_to_float8, replace_parameters_by_name,
19
+ convert_weight_dtype_wrapper)
20
+ from ..utils.lora_utils import merge_lora, unmerge_lora
21
+ from ..utils.utils import (filter_kwargs, get_image_to_video_latent, get_image_latent, timer,
22
+ get_video_to_video_latent, save_videos_grid)
23
+ from .controller import (Fun_Controller, Fun_Controller_Client,
24
+ all_cheduler_dict, css, ddpm_scheduler_dict,
25
+ flow_scheduler_dict, gradio_version,
26
+ gradio_version_is_above_4)
27
+ from .ui import (create_cfg_and_seedbox,
28
+ create_fake_finetune_models_checkpoints,
29
+ create_fake_height_width, create_fake_model_checkpoints,
30
+ create_fake_model_type, create_finetune_models_checkpoints,
31
+ create_generation_method,
32
+ create_generation_methods_and_video_length,
33
+ create_height_width, create_model_checkpoints,
34
+ create_model_type, create_prompts, create_samplers,
35
+ create_ui_outputs)
36
+ from ..dist import set_multi_gpus_devices, shard_model
37
+
38
+
39
+ class CogVideoXFunController(Fun_Controller):
40
+ def update_diffusion_transformer(self, diffusion_transformer_dropdown):
41
+ print(f"Update diffusion transformer: {diffusion_transformer_dropdown}")
42
+ self.diffusion_transformer_dropdown = diffusion_transformer_dropdown
43
+ if diffusion_transformer_dropdown == "none":
44
+ return gr.update()
45
+ self.vae = AutoencoderKLCogVideoX.from_pretrained(
46
+ diffusion_transformer_dropdown,
47
+ subfolder="vae",
48
+ ).to(self.weight_dtype)
49
+
50
+ # Get Transformer
51
+ self.transformer = CogVideoXTransformer3DModel.from_pretrained(
52
+ diffusion_transformer_dropdown,
53
+ subfolder="transformer",
54
+ low_cpu_mem_usage=True,
55
+ ).to(self.weight_dtype)
56
+
57
+ # Get tokenizer and text_encoder
58
+ tokenizer = T5Tokenizer.from_pretrained(
59
+ diffusion_transformer_dropdown, subfolder="tokenizer"
60
+ )
61
+ text_encoder = T5EncoderModel.from_pretrained(
62
+ diffusion_transformer_dropdown, subfolder="text_encoder", torch_dtype=self.weight_dtype
63
+ )
64
+
65
+ # Get pipeline
66
+ if self.model_type == "Inpaint":
67
+ if self.transformer.config.in_channels != self.vae.config.latent_channels:
68
+ self.pipeline = CogVideoXFunInpaintPipeline(
69
+ tokenizer=tokenizer,
70
+ text_encoder=text_encoder,
71
+ vae=self.vae,
72
+ transformer=self.transformer,
73
+ scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"),
74
+ )
75
+ else:
76
+ self.pipeline = CogVideoXFunPipeline(
77
+ tokenizer=tokenizer,
78
+ text_encoder=text_encoder,
79
+ vae=self.vae,
80
+ transformer=self.transformer,
81
+ scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"),
82
+ )
83
+ else:
84
+ self.pipeline = CogVideoXFunControlPipeline(
85
+ diffusion_transformer_dropdown,
86
+ vae=self.vae,
87
+ transformer=self.transformer,
88
+ scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"),
89
+ torch_dtype=self.weight_dtype
90
+ )
91
+
92
+ if self.ulysses_degree > 1 or self.ring_degree > 1:
93
+ from functools import partial
94
+ self.transformer.enable_multi_gpus_inference()
95
+ if self.fsdp_dit:
96
+ shard_fn = partial(shard_model, device_id=self.device, param_dtype=self.weight_dtype)
97
+ self.pipeline.transformer = shard_fn(self.pipeline.transformer)
98
+ print("Add FSDP DIT")
99
+ if self.fsdp_text_encoder:
100
+ shard_fn = partial(shard_model, device_id=self.device, param_dtype=self.weight_dtype)
101
+ self.pipeline.text_encoder = shard_fn(self.pipeline.text_encoder)
102
+ print("Add FSDP TEXT ENCODER")
103
+
104
+ if self.compile_dit:
105
+ for i in range(len(self.pipeline.transformer.transformer_blocks)):
106
+ self.pipeline.transformer.transformer_blocks[i] = torch.compile(self.pipeline.transformer.transformer_blocks[i])
107
+ print("Add Compile")
108
+
109
+ if self.GPU_memory_mode == "sequential_cpu_offload":
110
+ self.pipeline.enable_sequential_cpu_offload(device=self.device)
111
+ elif self.GPU_memory_mode == "model_cpu_offload_and_qfloat8":
112
+ convert_model_weight_to_float8(self.pipeline.transformer, exclude_module_name=[], device=self.device)
113
+ convert_weight_dtype_wrapper(self.pipeline.transformer, self.weight_dtype)
114
+ self.pipeline.enable_model_cpu_offload(device=self.device)
115
+ elif self.GPU_memory_mode == "model_cpu_offload":
116
+ self.pipeline.enable_model_cpu_offload(device=self.device)
117
+ elif self.GPU_memory_mode == "model_full_load_and_qfloat8":
118
+ convert_model_weight_to_float8(self.pipeline.transformer, exclude_module_name=[], device=self.device)
119
+ convert_weight_dtype_wrapper(self.pipeline.transformer, self.weight_dtype)
120
+ self.pipeline.to(self.device)
121
+ else:
122
+ self.pipeline.to(self.device)
123
+ print("Update diffusion transformer done")
124
+ return gr.update()
125
+
126
+ @timer
127
+ def generate(
128
+ self,
129
+ diffusion_transformer_dropdown,
130
+ base_model_dropdown,
131
+ lora_model_dropdown,
132
+ lora_alpha_slider,
133
+ prompt_textbox,
134
+ negative_prompt_textbox,
135
+ sampler_dropdown,
136
+ sample_step_slider,
137
+ resize_method,
138
+ width_slider,
139
+ height_slider,
140
+ base_resolution,
141
+ generation_method,
142
+ length_slider,
143
+ overlap_video_length,
144
+ partial_video_length,
145
+ cfg_scale_slider,
146
+ start_image,
147
+ end_image,
148
+ validation_video,
149
+ validation_video_mask,
150
+ control_video,
151
+ denoise_strength,
152
+ seed_textbox,
153
+ ref_image = None,
154
+ enable_teacache = None,
155
+ teacache_threshold = None,
156
+ num_skip_start_steps = None,
157
+ teacache_offload = None,
158
+ cfg_skip_ratio = None,
159
+ enable_riflex = None,
160
+ riflex_k = None,
161
+ base_model_2_dropdown=None,
162
+ lora_model_2_dropdown=None,
163
+ fps = None,
164
+ is_api = False,
165
+ ):
166
+ self.clear_cache()
167
+
168
+ print(f"Input checking.")
169
+ _, comment = self.input_check(
170
+ resize_method, generation_method, start_image, end_image, validation_video,control_video, is_api
171
+ )
172
+ print(f"Input checking down")
173
+ if comment != "OK":
174
+ return "", comment
175
+ is_image = True if generation_method == "Image Generation" else False
176
+
177
+ if self.base_model_path != base_model_dropdown:
178
+ self.update_base_model(base_model_dropdown)
179
+
180
+ if self.lora_model_path != lora_model_dropdown:
181
+ self.update_lora_model(lora_model_dropdown)
182
+
183
+ print(f"Load scheduler.")
184
+ self.pipeline.scheduler = self.scheduler_dict[sampler_dropdown].from_config(self.pipeline.scheduler.config)
185
+ print(f"Load scheduler down.")
186
+
187
+ if resize_method == "Resize according to Reference":
188
+ print(f"Calculate height and width according to Reference.")
189
+ height_slider, width_slider = self.get_height_width_from_reference(
190
+ base_resolution, start_image, validation_video, control_video,
191
+ )
192
+
193
+ if self.lora_model_path != "none":
194
+ print(f"Merge Lora.")
195
+ self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
196
+ print(f"Merge Lora done.")
197
+
198
+ if fps is None:
199
+ fps = 8
200
+
201
+ print(f"Generate seed.")
202
+ if seed_textbox != "" and int(seed_textbox) != -1: torch.manual_seed(int(seed_textbox))
203
+ else: seed_textbox = np.random.randint(0, 1e10)
204
+ generator = torch.Generator(device=self.device).manual_seed(int(seed_textbox))
205
+ print(f"Generate seed done.")
206
+
207
+ try:
208
+ print(f"Generation.")
209
+ if self.model_type == "Inpaint":
210
+ if self.transformer.config.in_channels != self.vae.config.latent_channels:
211
+ if generation_method == "Long Video Generation":
212
+ if validation_video is not None:
213
+ raise gr.Error(f"Video to Video is not Support Long Video Generation now.")
214
+ init_frames = 0
215
+ last_frames = init_frames + partial_video_length
216
+ while init_frames < length_slider:
217
+ if last_frames >= length_slider:
218
+ _partial_video_length = length_slider - init_frames
219
+ _partial_video_length = int((_partial_video_length - 1) // self.vae.config.temporal_compression_ratio * self.vae.config.temporal_compression_ratio) + 1
220
+
221
+ if _partial_video_length <= 0:
222
+ break
223
+ else:
224
+ _partial_video_length = partial_video_length
225
+
226
+ if last_frames >= length_slider:
227
+ input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, video_length=_partial_video_length, sample_size=(height_slider, width_slider))
228
+ else:
229
+ input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, None, video_length=_partial_video_length, sample_size=(height_slider, width_slider))
230
+
231
+ with torch.no_grad():
232
+ sample = self.pipeline(
233
+ prompt_textbox,
234
+ negative_prompt = negative_prompt_textbox,
235
+ num_inference_steps = sample_step_slider,
236
+ guidance_scale = cfg_scale_slider,
237
+ width = width_slider,
238
+ height = height_slider,
239
+ num_frames = _partial_video_length,
240
+ generator = generator,
241
+
242
+ video = input_video,
243
+ mask_video = input_video_mask,
244
+ strength = 1,
245
+ ).videos
246
+
247
+ if init_frames != 0:
248
+ mix_ratio = torch.from_numpy(
249
+ np.array([float(_index) / float(overlap_video_length) for _index in range(overlap_video_length)], np.float32)
250
+ ).unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
251
+
252
+ new_sample[:, :, -overlap_video_length:] = new_sample[:, :, -overlap_video_length:] * (1 - mix_ratio) + \
253
+ sample[:, :, :overlap_video_length] * mix_ratio
254
+ new_sample = torch.cat([new_sample, sample[:, :, overlap_video_length:]], dim = 2)
255
+
256
+ sample = new_sample
257
+ else:
258
+ new_sample = sample
259
+
260
+ if last_frames >= length_slider:
261
+ break
262
+
263
+ start_image = [
264
+ Image.fromarray(
265
+ (sample[0, :, _index].transpose(0, 1).transpose(1, 2) * 255).numpy().astype(np.uint8)
266
+ ) for _index in range(-overlap_video_length, 0)
267
+ ]
268
+
269
+ init_frames = init_frames + _partial_video_length - overlap_video_length
270
+ last_frames = init_frames + _partial_video_length
271
+ else:
272
+ if validation_video is not None:
273
+ input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(validation_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), validation_video_mask=validation_video_mask, fps=fps)
274
+ strength = denoise_strength
275
+ else:
276
+ input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider))
277
+ strength = 1
278
+
279
+ sample = self.pipeline(
280
+ prompt_textbox,
281
+ negative_prompt = negative_prompt_textbox,
282
+ num_inference_steps = sample_step_slider,
283
+ guidance_scale = cfg_scale_slider,
284
+ width = width_slider,
285
+ height = height_slider,
286
+ num_frames = length_slider if not is_image else 1,
287
+ generator = generator,
288
+
289
+ video = input_video,
290
+ mask_video = input_video_mask,
291
+ strength = strength,
292
+ ).videos
293
+ else:
294
+ sample = self.pipeline(
295
+ prompt_textbox,
296
+ negative_prompt = negative_prompt_textbox,
297
+ num_inference_steps = sample_step_slider,
298
+ guidance_scale = cfg_scale_slider,
299
+ width = width_slider,
300
+ height = height_slider,
301
+ num_frames = length_slider if not is_image else 1,
302
+ generator = generator
303
+ ).videos
304
+ else:
305
+ input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(control_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), fps=fps)
306
+
307
+ sample = self.pipeline(
308
+ prompt_textbox,
309
+ negative_prompt = negative_prompt_textbox,
310
+ num_inference_steps = sample_step_slider,
311
+ guidance_scale = cfg_scale_slider,
312
+ width = width_slider,
313
+ height = height_slider,
314
+ num_frames = length_slider if not is_image else 1,
315
+ generator = generator,
316
+
317
+ control_video = input_video,
318
+ ).videos
319
+ except Exception as e:
320
+ self.auto_model_clear_cache(self.pipeline.transformer)
321
+ self.auto_model_clear_cache(self.pipeline.text_encoder)
322
+ self.auto_model_clear_cache(self.pipeline.vae)
323
+ self.clear_cache()
324
+
325
+ print(f"Error. error information is {str(e)}")
326
+ if self.lora_model_path != "none":
327
+ self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
328
+ if is_api:
329
+ return "", f"Error. error information is {str(e)}"
330
+ else:
331
+ return gr.update(), gr.update(), f"Error. error information is {str(e)}"
332
+
333
+ self.clear_cache()
334
+ # lora part
335
+ if self.lora_model_path != "none":
336
+ print(f"Unmerge Lora.")
337
+ self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
338
+ print(f"Unmerge Lora done.")
339
+
340
+ print(f"Saving outputs.")
341
+ save_sample_path = self.save_outputs(
342
+ is_image, length_slider, sample, fps=fps
343
+ )
344
+ print(f"Saving outputs done.")
345
+
346
+ if is_image or length_slider == 1:
347
+ if is_api:
348
+ return save_sample_path, "Success"
349
+ else:
350
+ if gradio_version_is_above_4:
351
+ return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success"
352
+ else:
353
+ return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success"
354
+ else:
355
+ if is_api:
356
+ return save_sample_path, "Success"
357
+ else:
358
+ if gradio_version_is_above_4:
359
+ return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
360
+ else:
361
+ return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success"
362
+
363
+ CogVideoXFunController_Host = CogVideoXFunController
364
+ CogVideoXFunController_Client = Fun_Controller_Client
365
+
366
+ def ui(GPU_memory_mode, scheduler_dict, compile_dit, weight_dtype, savedir_sample=None):
367
+ controller = CogVideoXFunController(
368
+ GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
369
+ compile_dit=compile_dit,
370
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
371
+ )
372
+
373
+ with gr.Blocks(css=css) as demo:
374
+ gr.Markdown(
375
+ """
376
+ # CogVideoX-Fun:
377
+
378
+ A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos.
379
+
380
+ [Github](https://github.com/aigc-apps/CogVideoX-Fun/)
381
+ """
382
+ )
383
+ with gr.Column(variant="panel"):
384
+ model_type = create_model_type(visible=True)
385
+ diffusion_transformer_dropdown, diffusion_transformer_refresh_button = \
386
+ create_model_checkpoints(controller, visible=True)
387
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button = \
388
+ create_finetune_models_checkpoints(controller, visible=True)
389
+
390
+ with gr.Column(variant="panel"):
391
+ prompt_textbox, negative_prompt_textbox = create_prompts()
392
+
393
+ with gr.Row():
394
+ with gr.Column():
395
+ sampler_dropdown, sample_step_slider = create_samplers(controller)
396
+
397
+ resize_method, width_slider, height_slider, base_resolution = create_height_width(
398
+ default_height = 384, default_width = 672, maximum_height = 1344,
399
+ maximum_width = 1344,
400
+ )
401
+ gr.Markdown(
402
+ """
403
+ V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames.
404
+ (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。)
405
+ """
406
+ )
407
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
408
+ create_generation_methods_and_video_length(
409
+ ["Video Generation", "Image Generation", "Long Video Generation"],
410
+ default_video_length=49,
411
+ maximum_video_length=85,
412
+ )
413
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
414
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)", "Video Control (视频控制)"], prompt_textbox
415
+ )
416
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
417
+
418
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
419
+
420
+ result_image, result_video, infer_progress = create_ui_outputs()
421
+
422
+ model_type.change(
423
+ fn=controller.update_model_type,
424
+ inputs=[model_type],
425
+ outputs=[]
426
+ )
427
+
428
+ def upload_generation_method(generation_method):
429
+ if generation_method == "Video Generation":
430
+ return [gr.update(visible=True, maximum=85, value=49, interactive=True), gr.update(visible=False), gr.update(visible=False)]
431
+ elif generation_method == "Image Generation":
432
+ return [gr.update(minimum=1, maximum=1, value=1, interactive=False), gr.update(visible=False), gr.update(visible=False)]
433
+ else:
434
+ return [gr.update(visible=True, maximum=1344), gr.update(visible=True), gr.update(visible=True)]
435
+ generation_method.change(
436
+ upload_generation_method, generation_method, [length_slider, overlap_video_length, partial_video_length]
437
+ )
438
+
439
+ def upload_source_method(source_method):
440
+ if source_method == "Text to Video (文本到视频)":
441
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
442
+ elif source_method == "Image to Video (图片到视频)":
443
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
444
+ elif source_method == "Video to Video (视频到视频)":
445
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)]
446
+ else:
447
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()]
448
+ source_method.change(
449
+ upload_source_method, source_method, [
450
+ image_to_video_col, video_to_video_col, control_video_col, start_image, end_image,
451
+ validation_video, validation_video_mask, control_video
452
+ ]
453
+ )
454
+
455
+ def upload_resize_method(resize_method):
456
+ if resize_method == "Generate by":
457
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
458
+ else:
459
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
460
+ resize_method.change(
461
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
462
+ )
463
+
464
+ generate_button.click(
465
+ fn=controller.generate,
466
+ inputs=[
467
+ diffusion_transformer_dropdown,
468
+ base_model_dropdown,
469
+ lora_model_dropdown,
470
+ lora_alpha_slider,
471
+ prompt_textbox,
472
+ negative_prompt_textbox,
473
+ sampler_dropdown,
474
+ sample_step_slider,
475
+ resize_method,
476
+ width_slider,
477
+ height_slider,
478
+ base_resolution,
479
+ generation_method,
480
+ length_slider,
481
+ overlap_video_length,
482
+ partial_video_length,
483
+ cfg_scale_slider,
484
+ start_image,
485
+ end_image,
486
+ validation_video,
487
+ validation_video_mask,
488
+ control_video,
489
+ denoise_strength,
490
+ seed_textbox,
491
+ ],
492
+ outputs=[result_image, result_video, infer_progress]
493
+ )
494
+ return demo, controller
495
+
496
+ def ui_host(GPU_memory_mode, scheduler_dict, model_name, model_type, compile_dit, weight_dtype, savedir_sample=None):
497
+ controller = CogVideoXFunController_Host(
498
+ GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type,
499
+ compile_dit=compile_dit,
500
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
501
+ )
502
+
503
+ with gr.Blocks(css=css) as demo:
504
+ gr.Markdown(
505
+ """
506
+ # CogVideoX-Fun
507
+
508
+ A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos.
509
+
510
+ [Github](https://github.com/aigc-apps/CogVideoX-Fun/)
511
+ """
512
+ )
513
+ with gr.Column(variant="panel"):
514
+ model_type = create_fake_model_type(visible=False)
515
+ diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True)
516
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True)
517
+
518
+ with gr.Column(variant="panel"):
519
+ prompt_textbox, negative_prompt_textbox = create_prompts()
520
+
521
+ with gr.Row():
522
+ with gr.Column():
523
+ sampler_dropdown, sample_step_slider = create_samplers(controller)
524
+
525
+ resize_method, width_slider, height_slider, base_resolution = create_height_width(
526
+ default_height = 384, default_width = 672, maximum_height = 1344,
527
+ maximum_width = 1344,
528
+ )
529
+ gr.Markdown(
530
+ """
531
+ V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames.
532
+ (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。)
533
+ """
534
+ )
535
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
536
+ create_generation_methods_and_video_length(
537
+ ["Video Generation", "Image Generation"],
538
+ default_video_length=49,
539
+ maximum_video_length=85,
540
+ )
541
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
542
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)", "Video Control (视频控制)"], prompt_textbox
543
+ )
544
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
545
+
546
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
547
+
548
+ result_image, result_video, infer_progress = create_ui_outputs()
549
+
550
+ def upload_generation_method(generation_method):
551
+ if generation_method == "Video Generation":
552
+ return gr.update(visible=True, minimum=8, maximum=85, value=49, interactive=True)
553
+ elif generation_method == "Image Generation":
554
+ return gr.update(minimum=1, maximum=1, value=1, interactive=False)
555
+ generation_method.change(
556
+ upload_generation_method, generation_method, [length_slider]
557
+ )
558
+
559
+ def upload_source_method(source_method):
560
+ if source_method == "Text to Video (文本到视频)":
561
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
562
+ elif source_method == "Image to Video (图片到视频)":
563
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
564
+ elif source_method == "Video to Video (视频到视频)":
565
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)]
566
+ else:
567
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()]
568
+ source_method.change(
569
+ upload_source_method, source_method, [
570
+ image_to_video_col, video_to_video_col, control_video_col, start_image, end_image,
571
+ validation_video, validation_video_mask, control_video
572
+ ]
573
+ )
574
+
575
+ def upload_resize_method(resize_method):
576
+ if resize_method == "Generate by":
577
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
578
+ else:
579
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
580
+ resize_method.change(
581
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
582
+ )
583
+
584
+ generate_button.click(
585
+ fn=controller.generate,
586
+ inputs=[
587
+ diffusion_transformer_dropdown,
588
+ base_model_dropdown,
589
+ lora_model_dropdown,
590
+ lora_alpha_slider,
591
+ prompt_textbox,
592
+ negative_prompt_textbox,
593
+ sampler_dropdown,
594
+ sample_step_slider,
595
+ resize_method,
596
+ width_slider,
597
+ height_slider,
598
+ base_resolution,
599
+ generation_method,
600
+ length_slider,
601
+ overlap_video_length,
602
+ partial_video_length,
603
+ cfg_scale_slider,
604
+ start_image,
605
+ end_image,
606
+ validation_video,
607
+ validation_video_mask,
608
+ control_video,
609
+ denoise_strength,
610
+ seed_textbox,
611
+ ],
612
+ outputs=[result_image, result_video, infer_progress]
613
+ )
614
+ return demo, controller
615
+
616
+ def ui_client(scheduler_dict, model_name, savedir_sample=None):
617
+ controller = CogVideoXFunController_Client(scheduler_dict, savedir_sample)
618
+
619
+ with gr.Blocks(css=css) as demo:
620
+ gr.Markdown(
621
+ """
622
+ # CogVideoX-Fun
623
+
624
+ A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos.
625
+
626
+ [Github](https://github.com/aigc-apps/CogVideoX-Fun/)
627
+ """
628
+ )
629
+ with gr.Column(variant="panel"):
630
+ diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True)
631
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True)
632
+
633
+ with gr.Column(variant="panel"):
634
+ prompt_textbox, negative_prompt_textbox = create_prompts()
635
+
636
+ with gr.Row():
637
+ with gr.Column():
638
+ sampler_dropdown, sample_step_slider = create_samplers(controller, maximum_step=50)
639
+
640
+ resize_method, width_slider, height_slider, base_resolution = create_fake_height_width(
641
+ default_height = 384, default_width = 672, maximum_height = 1344,
642
+ maximum_width = 1344,
643
+ )
644
+ gr.Markdown(
645
+ """
646
+ V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames.
647
+ (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。)
648
+ """
649
+ )
650
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
651
+ create_generation_methods_and_video_length(
652
+ ["Video Generation", "Image Generation"],
653
+ default_video_length=49,
654
+ maximum_video_length=85,
655
+ )
656
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
657
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)"], prompt_textbox
658
+ )
659
+
660
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
661
+
662
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
663
+
664
+ result_image, result_video, infer_progress = create_ui_outputs()
665
+
666
+ def upload_generation_method(generation_method):
667
+ if generation_method == "Video Generation":
668
+ return gr.update(visible=True, minimum=5, maximum=85, value=49, interactive=True)
669
+ elif generation_method == "Image Generation":
670
+ return gr.update(minimum=1, maximum=1, value=1, interactive=False)
671
+ generation_method.change(
672
+ upload_generation_method, generation_method, [length_slider]
673
+ )
674
+
675
+ def upload_source_method(source_method):
676
+ if source_method == "Text to Video (文本到视频)":
677
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
678
+ elif source_method == "Image to Video (图片到视频)":
679
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None)]
680
+ else:
681
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(), gr.update()]
682
+ source_method.change(
683
+ upload_source_method, source_method, [image_to_video_col, video_to_video_col, start_image, end_image, validation_video, validation_video_mask]
684
+ )
685
+
686
+ def upload_resize_method(resize_method):
687
+ if resize_method == "Generate by":
688
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
689
+ else:
690
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
691
+ resize_method.change(
692
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
693
+ )
694
+
695
+ generate_button.click(
696
+ fn=controller.generate,
697
+ inputs=[
698
+ diffusion_transformer_dropdown,
699
+ base_model_dropdown,
700
+ lora_model_dropdown,
701
+ lora_alpha_slider,
702
+ prompt_textbox,
703
+ negative_prompt_textbox,
704
+ sampler_dropdown,
705
+ sample_step_slider,
706
+ resize_method,
707
+ width_slider,
708
+ height_slider,
709
+ base_resolution,
710
+ generation_method,
711
+ length_slider,
712
+ cfg_scale_slider,
713
+ start_image,
714
+ end_image,
715
+ validation_video,
716
+ validation_video_mask,
717
+ denoise_strength,
718
+ seed_textbox,
719
+ ],
720
+ outputs=[result_image, result_video, infer_progress]
721
+ )
722
+ return demo, controller
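
The `ui`, `ui_host`, and `ui_client` builders above each return a `(demo, controller)` pair. A minimal launch sketch follows; the scheduler dictionary, GPU memory mode, dtype, and port are illustrative assumptions, and an actual entry point (such as the repository's app.py) may wire these up differently.

```python
# Hedged sketch: launching the CogVideoX-Fun Gradio demo built by ui().
# GPU_memory_mode, dtype, and port values are illustrative assumptions.
import torch

from videox_fun.ui.cogvideox_fun_ui import ui
from videox_fun.ui.controller import all_cheduler_dict  # name as defined in controller.py

demo, controller = ui(
    GPU_memory_mode="model_cpu_offload",  # one of the modes handled in update_diffusion_transformer
    scheduler_dict=all_cheduler_dict,
    compile_dit=False,
    weight_dtype=torch.bfloat16,
)
demo.launch(server_name="0.0.0.0", server_port=7860)
```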
videox_fun/ui/controller.py ADDED
@@ -0,0 +1,514 @@
1
+ """Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py
2
+ """
3
+ import base64
4
+ import gc
5
+ import json
6
+ import os
7
+ import hashlib
8
+ import random
9
+ from datetime import datetime
10
+ from glob import glob
11
+
12
+ import cv2
13
+ import gradio as gr
14
+ import numpy as np
15
+ import pkg_resources
16
+ import requests
17
+ import torch
18
+ from diffusers import (CogVideoXDDIMScheduler, DDIMScheduler,
19
+ DPMSolverMultistepScheduler,
20
+ EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
21
+ FlowMatchEulerDiscreteScheduler, PNDMScheduler)
22
+ from omegaconf import OmegaConf
23
+ from PIL import Image
24
+ from safetensors import safe_open
25
+
26
+ from ..data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
27
+ from ..utils.utils import save_videos_grid
28
+ from ..utils.fm_solvers import FlowDPMSolverMultistepScheduler
29
+ from ..utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
30
+ from ..dist import set_multi_gpus_devices
31
+
32
+ gradio_version = pkg_resources.get_distribution("gradio").version
33
+ gradio_version_is_above_4 = True if int(gradio_version.split('.')[0]) >= 4 else False
34
+
35
+ css = """
36
+ .toolbutton {
37
+ margin-buttom: 0em 0em 0em 0em;
38
+ max-width: 2.5em;
39
+ min-width: 2.5em !important;
40
+ height: 2.5em;
41
+ }
42
+ """
43
+
44
+ ddpm_scheduler_dict = {
45
+ "Euler": EulerDiscreteScheduler,
46
+ "Euler A": EulerAncestralDiscreteScheduler,
47
+ "DPM++": DPMSolverMultistepScheduler,
48
+ "PNDM": PNDMScheduler,
49
+ "DDIM": DDIMScheduler,
50
+ "DDIM_Origin": DDIMScheduler,
51
+ "DDIM_Cog": CogVideoXDDIMScheduler,
52
+ }
53
+ flow_scheduler_dict = {
54
+ "Flow": FlowMatchEulerDiscreteScheduler,
55
+ "Flow_Unipc": FlowUniPCMultistepScheduler,
56
+ "Flow_DPM++": FlowDPMSolverMultistepScheduler,
57
+ }
58
+ all_cheduler_dict = {**ddpm_scheduler_dict, **flow_scheduler_dict}
59
+
60
+ class Fun_Controller:
61
+ def __init__(
62
+ self, GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
63
+ config_path=None, ulysses_degree=1, ring_degree=1,
64
+ fsdp_dit=False, fsdp_text_encoder=False, compile_dit=False,
65
+ weight_dtype=None, savedir_sample=None,
66
+ ):
67
+ # config dirs
68
+ self.basedir = os.getcwd()
69
+ self.config_dir = os.path.join(self.basedir, "config")
70
+ self.diffusion_transformer_dir = os.path.join(self.basedir, "models", "Diffusion_Transformer")
71
+ self.motion_module_dir = os.path.join(self.basedir, "models", "Motion_Module")
72
+ self.personalized_model_dir = os.path.join(self.basedir, "models", "Personalized_Model")
73
+ if savedir_sample is None:
74
+ self.savedir_sample = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
75
+ else:
76
+ self.savedir_sample = savedir_sample
77
+ os.makedirs(self.savedir_sample, exist_ok=True)
78
+
79
+ self.GPU_memory_mode = GPU_memory_mode
80
+ self.model_name = model_name
81
+ self.diffusion_transformer_dropdown = model_name
82
+ self.scheduler_dict = scheduler_dict
83
+ self.model_type = model_type
84
+ if config_path is not None:
85
+ self.config_path = os.path.realpath(config_path)
86
+ self.config = OmegaConf.load(config_path)
87
+ else:
88
+ self.config_path = None
89
+ self.ulysses_degree = ulysses_degree
90
+ self.ring_degree = ring_degree
91
+ self.fsdp_dit = fsdp_dit
92
+ self.fsdp_text_encoder = fsdp_text_encoder
93
+ self.compile_dit = compile_dit
94
+ self.weight_dtype = weight_dtype
95
+ self.device = set_multi_gpus_devices(self.ulysses_degree, self.ring_degree)
96
+
97
+ self.diffusion_transformer_list = []
98
+ self.motion_module_list = []
99
+ self.personalized_model_list = []
100
+ self.config_list = []
101
+
102
+ # config models
103
+ self.tokenizer = None
104
+ self.text_encoder = None
105
+ self.vae = None
106
+ self.transformer = None
107
+ self.transformer_2 = None
108
+ self.pipeline = None
109
+ self.base_model_path = "none"
110
+ self.base_model_2_path = "none"
111
+ self.lora_model_path = "none"
112
+ self.lora_model_2_path = "none"
113
+
114
+ self.refresh_config()
115
+ self.refresh_diffusion_transformer()
116
+ self.refresh_personalized_model()
117
+ if model_name != None:
118
+ self.update_diffusion_transformer(model_name)
119
+
120
+ def refresh_config(self):
121
+ config_list = []
122
+ for root, dirs, files in os.walk(self.config_dir):
123
+ for file in files:
124
+ if file.endswith(('.yaml', '.yml')):
125
+ full_path = os.path.join(root, file)
126
+ config_list.append(full_path)
127
+ self.config_list = config_list
128
+
129
+ def refresh_diffusion_transformer(self):
130
+ self.diffusion_transformer_list = sorted(glob(os.path.join(self.diffusion_transformer_dir, "*/")))
131
+
132
+ def refresh_personalized_model(self):
133
+ personalized_model_list = sorted(glob(os.path.join(self.personalized_model_dir, "*.safetensors")))
134
+ self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list]
135
+
136
+ def update_model_type(self, model_type):
137
+ self.model_type = model_type
138
+
139
+ def update_config(self, config_dropdown):
140
+ self.config_path = config_dropdown
141
+ self.config = OmegaConf.load(config_dropdown)
142
+ print(f"Update config: {config_dropdown}")
143
+
144
+ def update_diffusion_transformer(self, diffusion_transformer_dropdown):
145
+ pass
146
+
147
+ def update_base_model(self, base_model_dropdown, is_checkpoint_2=False):
148
+ if not is_checkpoint_2:
149
+ self.base_model_path = base_model_dropdown
150
+ else:
151
+ self.base_model_2_path = base_model_dropdown
152
+ print(f"Update base model: {base_model_dropdown}")
153
+ if base_model_dropdown == "none":
154
+ return gr.update()
155
+ if self.transformer is None and not is_checkpoint_2:
156
+ gr.Info(f"Please select a pretrained model path.")
157
+ print(f"Please select a pretrained model path.")
158
+ return gr.update(value=None)
159
+ elif self.transformer_2 is None and is_checkpoint_2:
160
+ gr.Info(f"Please select a pretrained model path.")
161
+ print(f"Please select a pretrained model path.")
162
+ return gr.update(value=None)
163
+ else:
164
+ base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown)
165
+ base_model_state_dict = {}
166
+ with safe_open(base_model_dropdown, framework="pt", device="cpu") as f:
167
+ for key in f.keys():
168
+ base_model_state_dict[key] = f.get_tensor(key)
169
+ if not is_checkpoint_2:
170
+ self.transformer.load_state_dict(base_model_state_dict, strict=False)
171
+ else:
172
+ self.transformer_2.load_state_dict(base_model_state_dict, strict=False)
173
+ print("Update base model done")
174
+ return gr.update()
175
+
176
+ def update_lora_model(self, lora_model_dropdown, is_checkpoint_2=False):
177
+ print(f"Update lora model: {lora_model_dropdown}")
178
+ if lora_model_dropdown == "none":
179
+ self.lora_model_path = "none"
180
+ return gr.update()
181
+ lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown)
182
+ if not is_checkpoint_2:
183
+ self.lora_model_path = lora_model_dropdown
184
+ else:
185
+ self.lora_model_2_path = lora_model_dropdown
186
+ return gr.update()
187
+
188
+ def clear_cache(self,):
189
+ gc.collect()
190
+ torch.cuda.empty_cache()
191
+ torch.cuda.ipc_collect()
192
+
193
+ def auto_model_clear_cache(self, model):
194
+ origin_device = model.device
195
+ model = model.to("cpu")
196
+ gc.collect()
197
+ torch.cuda.empty_cache()
198
+ torch.cuda.ipc_collect()
199
+ model = model.to(origin_device)
200
+
201
+ def input_check(self,
202
+ resize_method,
203
+ generation_method,
204
+ start_image,
205
+ end_image,
206
+ validation_video,
207
+ control_video,
208
+ is_api = False,
209
+ ):
210
+ if self.transformer is None:
211
+ if is_api:
212
+ return "", f"Please select a pretrained model path."
213
+ else:
214
+ raise gr.Error(f"Please select a pretrained model path.")
215
+
216
+ if control_video is not None and self.model_type == "Inpaint":
217
+ if is_api:
218
+ return "", f"If specifying the control video, please set the model_type == \"Control\". "
219
+ else:
220
+ raise gr.Error(f"If specifying the control video, please set the model_type == \"Control\". ")
221
+
222
+ if control_video is None and self.model_type == "Control":
223
+ if is_api:
224
+ return "", f"If set the model_type == \"Control\", please specifying the control video. "
225
+ else:
226
+ raise gr.Error(f"If set the model_type == \"Control\", please specifying the control video. ")
227
+
228
+ if resize_method == "Resize according to Reference":
229
+ if start_image is None and validation_video is None and control_video is None:
230
+ if is_api:
231
+ return "", f"Please upload an image when using \"Resize according to Reference\"."
232
+ else:
233
+ raise gr.Error(f"Please upload an image when using \"Resize according to Reference\".")
234
+
235
+ if self.transformer.config.in_channels == self.vae.config.latent_channels and start_image is not None:
236
+ if is_api:
237
+ return "", f"Please select an image to video pretrained model while using image to video."
238
+ else:
239
+ raise gr.Error(f"Please select an image to video pretrained model while using image to video.")
240
+
241
+ if self.transformer.config.in_channels == self.vae.config.latent_channels and generation_method == "Long Video Generation":
242
+ if is_api:
243
+ return "", f"Please select an image to video pretrained model while using long video generation."
244
+ else:
245
+ raise gr.Error(f"Please select an image to video pretrained model while using long video generation.")
246
+
247
+ if start_image is None and end_image is not None:
248
+ if is_api:
249
+ return "", f"If specifying the ending image of the video, please specify a starting image of the video."
250
+ else:
251
+ raise gr.Error(f"If specifying the ending image of the video, please specify a starting image of the video.")
252
+ return "", "OK"
253
+
254
+ def get_height_width_from_reference(
255
+ self,
256
+ base_resolution,
257
+ start_image,
258
+ validation_video,
259
+ control_video,
260
+ ):
261
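+ # Pick the aspect-ratio bucket (scaled from the 512 base to base_resolution) closest to the reference media, then round height and width to a multiple of twice the VAE spatial compression ratio.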
+ spatial_compression_ratio = self.vae.config.spatial_compression_ratio if hasattr(self.vae.config, "spatial_compression_ratio") else 8
262
+ aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
263
+ if self.model_type == "Inpaint":
264
+ if validation_video is not None:
265
+ original_width, original_height = Image.fromarray(cv2.VideoCapture(validation_video).read()[1]).size
266
+ else:
267
+ original_width, original_height = start_image[0].size if type(start_image) is list else Image.open(start_image).size
268
+ else:
269
+ original_width, original_height = Image.fromarray(cv2.VideoCapture(control_video).read()[1]).size
270
+ closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
271
+ height_slider, width_slider = [int(x / spatial_compression_ratio / 2) * spatial_compression_ratio * 2 for x in closest_size]
272
+ return height_slider, width_slider
273
+
274
+ def save_outputs(self, is_image, length_slider, sample, fps):
275
+ def save_results():
276
+ if not os.path.exists(self.savedir_sample):
277
+ os.makedirs(self.savedir_sample, exist_ok=True)
278
+ index = len([path for path in os.listdir(self.savedir_sample)]) + 1
279
+ prefix = str(index).zfill(8)
280
+
281
+ md5_hash = hashlib.md5(sample.cpu().numpy().tobytes()).hexdigest()
282
+
283
+ if is_image or length_slider == 1:
284
+ save_sample_path = os.path.join(self.savedir_sample, prefix + f"-{md5_hash}.png")
285
+ print(f"Saving to {save_sample_path}")
286
+ image = sample[0, :, 0]
287
+ image = image.transpose(0, 1).transpose(1, 2)
288
+ image = (image * 255).numpy().astype(np.uint8)
289
+ image = Image.fromarray(image)
290
+ image.save(save_sample_path)
291
+
292
+ else:
293
+ save_sample_path = os.path.join(self.savedir_sample, prefix + f"-{md5_hash}.mp4")
294
+ print(f"Saving to {save_sample_path}")
295
+ save_videos_grid(sample, save_sample_path, fps=fps)
296
+ return save_sample_path
297
+
298
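+ # Under sequence-parallel inference (ulysses/ring), only rank 0 writes the output file; other ranks return None.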
+ if self.ulysses_degree * self.ring_degree > 1:
299
+ import torch.distributed as dist
300
+ if dist.get_rank() == 0:
301
+ save_sample_path = save_results()
302
+ else:
303
+ save_sample_path = None
304
+ else:
305
+ save_sample_path = save_results()
306
+ return save_sample_path
307
+
308
+ def generate(
309
+ self,
310
+ diffusion_transformer_dropdown,
311
+ base_model_dropdown,
312
+ lora_model_dropdown,
313
+ lora_alpha_slider,
314
+ prompt_textbox,
315
+ negative_prompt_textbox,
316
+ sampler_dropdown,
317
+ sample_step_slider,
318
+ resize_method,
319
+ width_slider,
320
+ height_slider,
321
+ base_resolution,
322
+ generation_method,
323
+ length_slider,
324
+ overlap_video_length,
325
+ partial_video_length,
326
+ cfg_scale_slider,
327
+ start_image,
328
+ end_image,
329
+ validation_video,
330
+ validation_video_mask,
331
+ control_video,
332
+ denoise_strength,
333
+ seed_textbox,
334
+ enable_teacache = None,
335
+ teacache_threshold = None,
336
+ num_skip_start_steps = None,
337
+ teacache_offload = None,
338
+ cfg_skip_ratio = None,
339
+ enable_riflex = None,
340
+ riflex_k = None,
341
+ is_api = False,
342
+ ):
343
+ pass
344
+
345
+ def post_to_host(
346
+ diffusion_transformer_dropdown,
347
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider,
348
+ prompt_textbox, negative_prompt_textbox,
349
+ sampler_dropdown, sample_step_slider, resize_method, width_slider, height_slider,
350
+ base_resolution, generation_method, length_slider, cfg_scale_slider,
351
+ start_image, end_image, validation_video, validation_video_mask, denoise_strength, seed_textbox,
352
+ ref_image = None, enable_teacache = None, teacache_threshold = None, num_skip_start_steps = None,
353
+ teacache_offload = None, cfg_skip_ratio = None, enable_riflex = None, riflex_k = None,
354
+ ):
355
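+ # Read each provided media file and base64-encode it so it can travel inside the JSON payload.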
+ if start_image is not None:
356
+ with open(start_image, 'rb') as file:
357
+ file_content = file.read()
358
+ start_image_encoded_content = base64.b64encode(file_content)
359
+ start_image = start_image_encoded_content.decode('utf-8')
360
+
361
+ if end_image is not None:
362
+ with open(end_image, 'rb') as file:
363
+ file_content = file.read()
364
+ end_image_encoded_content = base64.b64encode(file_content)
365
+ end_image = end_image_encoded_content.decode('utf-8')
366
+
367
+ if validation_video is not None:
368
+ with open(validation_video, 'rb') as file:
369
+ file_content = file.read()
370
+ validation_video_encoded_content = base64.b64encode(file_content)
371
+ validation_video = validation_video_encoded_content.decode('utf-8')
372
+
373
+ if validation_video_mask is not None:
374
+ with open(validation_video_mask, 'rb') as file:
375
+ file_content = file.read()
376
+ validation_video_mask_encoded_content = base64.b64encode(file_content)
377
+ validation_video_mask = validation_video_mask_encoded_content.decode('utf-8')
378
+
379
+ if ref_image is not None:
380
+ with open(ref_image, 'rb') as file:
381
+ file_content = file.read()
382
+ ref_image_encoded_content = base64.b64encode(file_content)
383
+ ref_image = ref_image_encoded_content.decode('utf-8')
384
+
385
+ datas = {
386
+ "base_model_path": base_model_dropdown,
387
+ "lora_model_path": lora_model_dropdown,
388
+ "lora_alpha_slider": lora_alpha_slider,
389
+ "prompt_textbox": prompt_textbox,
390
+ "negative_prompt_textbox": negative_prompt_textbox,
391
+ "sampler_dropdown": sampler_dropdown,
392
+ "sample_step_slider": sample_step_slider,
393
+ "resize_method": resize_method,
394
+ "width_slider": width_slider,
395
+ "height_slider": height_slider,
396
+ "base_resolution": base_resolution,
397
+ "generation_method": generation_method,
398
+ "length_slider": length_slider,
399
+ "cfg_scale_slider": cfg_scale_slider,
400
+ "start_image": start_image,
401
+ "end_image": end_image,
402
+ "validation_video": validation_video,
403
+ "validation_video_mask": validation_video_mask,
404
+ "denoise_strength": denoise_strength,
405
+ "seed_textbox": seed_textbox,
406
+
407
+ "ref_image": ref_image,
408
+ "enable_teacache": enable_teacache,
409
+ "teacache_threshold": teacache_threshold,
410
+ "num_skip_start_steps": num_skip_start_steps,
411
+ "teacache_offload": teacache_offload,
412
+ "cfg_skip_ratio": cfg_skip_ratio,
413
+ "enable_riflex": enable_riflex,
414
+ "riflex_k": riflex_k,
415
+ }
416
+
417
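+ # Authenticate with the EAS token from the environment and POST the request to the remote inference endpoint.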
+ session = requests.session()
418
+ session.headers.update({"Authorization": os.environ.get("EAS_TOKEN")})
419
+
420
+ response = session.post(url=f'{os.environ.get("EAS_URL")}/videox_fun/infer_forward', json=datas, timeout=300)
421
+
422
+ outputs = response.json()
423
+ return outputs
424
+
425
+
426
+ class Fun_Controller_Client:
427
+ def __init__(self, scheduler_dict, savedir_sample):
428
+ self.basedir = os.getcwd()
429
+ if savedir_sample is None:
430
+ self.savedir_sample = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
431
+ else:
432
+ self.savedir_sample = savedir_sample
433
+ os.makedirs(self.savedir_sample, exist_ok=True)
434
+
435
+ self.scheduler_dict = scheduler_dict
436
+
437
+ def generate(
438
+ self,
439
+ diffusion_transformer_dropdown,
440
+ base_model_dropdown,
441
+ lora_model_dropdown,
442
+ lora_alpha_slider,
443
+ prompt_textbox,
444
+ negative_prompt_textbox,
445
+ sampler_dropdown,
446
+ sample_step_slider,
447
+ resize_method,
448
+ width_slider,
449
+ height_slider,
450
+ base_resolution,
451
+ generation_method,
452
+ length_slider,
453
+ cfg_scale_slider,
454
+ start_image,
455
+ end_image,
456
+ validation_video,
457
+ validation_video_mask,
458
+ denoise_strength,
459
+ seed_textbox,
460
+ ref_image = None,
461
+ enable_teacache = None,
462
+ teacache_threshold = None,
463
+ num_skip_start_steps = None,
464
+ teacache_offload = None,
465
+ cfg_skip_ratio = None,
466
+ enable_riflex = None,
467
+ riflex_k = None,
468
+ ):
469
+ is_image = True if generation_method == "Image Generation" else False
470
+
471
+ outputs = post_to_host(
472
+ diffusion_transformer_dropdown,
473
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider,
474
+ prompt_textbox, negative_prompt_textbox,
475
+ sampler_dropdown, sample_step_slider, resize_method, width_slider, height_slider,
476
+ base_resolution, generation_method, length_slider, cfg_scale_slider,
477
+ start_image, end_image, validation_video, validation_video_mask, denoise_strength,
478
+ seed_textbox, ref_image = ref_image, enable_teacache = enable_teacache, teacache_threshold = teacache_threshold,
479
+ num_skip_start_steps = num_skip_start_steps, teacache_offload = teacache_offload,
480
+ cfg_skip_ratio = cfg_skip_ratio, enable_riflex = enable_riflex, riflex_k = riflex_k,
481
+ )
482
+
483
+ try:
484
+ base64_encoding = outputs["base64_encoding"]
485
+ except:
486
+ return gr.Image(visible=False, value=None), gr.Video(None, visible=True), outputs["message"]
487
+
488
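+ # The host returns the generated media as a base64 string; decode it and persist it as a PNG or MP4 under savedir_sample.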
+ decoded_data = base64.b64decode(base64_encoding)
489
+
490
+ if not os.path.exists(self.savedir_sample):
491
+ os.makedirs(self.savedir_sample, exist_ok=True)
492
+ md5_hash = hashlib.md5(decoded_data).hexdigest()
493
+
494
+ index = len([path for path in os.listdir(self.savedir_sample)]) + 1
495
+ prefix = str(index).zfill(8)
496
+
497
+ if is_image or length_slider == 1:
498
+ save_sample_path = os.path.join(self.savedir_sample, prefix + f"-{md5_hash}.png")
499
+ print(f"Saving to {save_sample_path}")
500
+ with open(save_sample_path, "wb") as file:
501
+ file.write(decoded_data)
502
+ if gradio_version_is_above_4:
503
+ return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success"
504
+ else:
505
+ return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success"
506
+ else:
507
+ save_sample_path = os.path.join(self.savedir_sample, prefix + f"-{md5_hash}.mp4")
508
+ print(f"Saving to {save_sample_path}")
509
+ with open(save_sample_path, "wb") as file:
510
+ file.write(decoded_data)
511
+ if gradio_version_is_above_4:
512
+ return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
513
+ else:
514
+ return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success"
videox_fun/ui/ui.py ADDED
@@ -0,0 +1,366 @@
1
+ import random
2
+
3
+ import gradio as gr
4
+
5
+
6
+ def create_model_type(visible):
7
+ gr.Markdown(
8
+ """
9
+ ### Model Type.
10
+ """,
11
+ visible=visible,
12
+ )
13
+ with gr.Row():
14
+ model_type = gr.Dropdown(
15
+ label="The model type of the model",
16
+ choices=["Inpaint", "Control"],
17
+ value="Inpaint",
18
+ visible=visible,
19
+ interactive=True,
20
+ )
21
+ return model_type
22
+
23
+ def create_fake_model_type(visible):
24
+ gr.Markdown(
25
+ """
26
+ ### Model Type.
27
+ """,
28
+ visible=visible,
29
+ )
30
+ with gr.Row():
31
+ model_type = gr.Dropdown(
32
+ label="The model type of the model",
33
+ choices=["Inpaint", "Control"],
34
+ value="Inpaint",
35
+ interactive=False,
36
+ visible=visible,
37
+ )
38
+ return model_type
39
+
40
+ def create_model_checkpoints(controller, visible, default_model="none"):
41
+ gr.Markdown(
42
+ """
43
+ ### Model checkpoints.
44
+ """
45
+ )
46
+ with gr.Row(visible=visible):
47
+ diffusion_transformer_dropdown = gr.Dropdown(
48
+ label="Pretrained Model Path",
49
+ choices=list(set(controller.diffusion_transformer_list + [default_model])),
50
+ value=default_model,
51
+ interactive=True,
52
+ )
53
+ diffusion_transformer_dropdown.change(
54
+ fn=controller.update_diffusion_transformer,
55
+ inputs=[diffusion_transformer_dropdown],
56
+ outputs=[diffusion_transformer_dropdown]
57
+ )
58
+
59
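+ # The refresh button re-scans the model directory and updates the dropdown choices in place.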
+ diffusion_transformer_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
60
+ def refresh_diffusion_transformer():
61
+ controller.refresh_diffusion_transformer()
62
+ return gr.update(choices=controller.diffusion_transformer_list)
63
+ diffusion_transformer_refresh_button.click(fn=refresh_diffusion_transformer, inputs=[], outputs=[diffusion_transformer_dropdown])
64
+
65
+ return diffusion_transformer_dropdown, diffusion_transformer_refresh_button
66
+
67
+ def create_fake_model_checkpoints(model_name, visible):
68
+ gr.Markdown(
69
+ """
70
+ ### Model checkpoints.
71
+ """
72
+ )
73
+ with gr.Row(visible=visible):
74
+ diffusion_transformer_dropdown = gr.Dropdown(
75
+ label="Pretrained Model Path",
76
+ choices=[model_name],
77
+ value=model_name,
78
+ interactive=False,
79
+ )
80
+ return diffusion_transformer_dropdown
81
+
82
+ def create_finetune_models_checkpoints(controller, visible, add_checkpoint_2=False, default_lora="none"):
83
+ with gr.Row(visible=visible):
84
+ base_model_dropdown = gr.Dropdown(
85
+ label="Select base Dreambooth model",
86
+ choices=["none"] + controller.personalized_model_list,
87
+ value="none",
88
+ interactive=True,
89
+ )
90
+ if add_checkpoint_2:
91
+ base_model_2_dropdown = gr.Dropdown(
92
+ label="Select base Dreambooth model",
93
+ choices=["none"] + controller.personalized_model_list,
94
+ value="none",
95
+ interactive=True,
96
+ )
97
+
98
+ lora_model_dropdown = gr.Dropdown(
99
+ label="Select LoRA model",
100
+ choices=list(set(["none"] + controller.personalized_model_list + [default_lora])),
101
+ value=default_lora,
102
+ interactive=True,
103
+ )
104
+ if add_checkpoint_2:
105
+ lora_model_2_dropdown = gr.Dropdown(
106
+ label="Select LoRA model",
107
+ choices=["none"] + controller.personalized_model_list,
108
+ value="none",
109
+ interactive=True,
110
+ )
111
+
112
+ lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.55, minimum=0, maximum=2, interactive=True)
113
+
114
+ personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
115
+ def update_personalized_model():
116
+ controller.refresh_personalized_model()
117
+ return [
118
+ gr.update(choices=controller.personalized_model_list),
119
+ gr.update(choices=["none"] + controller.personalized_model_list)
120
+ ]
121
+ personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown])
122
+
123
+ if not add_checkpoint_2:
124
+ return base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button
125
+ else:
126
+ return [base_model_dropdown, base_model_2_dropdown], [lora_model_dropdown, lora_model_2_dropdown], \
127
+ lora_alpha_slider, personalized_refresh_button
128
+
129
+ def create_fake_finetune_models_checkpoints(visible):
130
+ with gr.Row():
131
+ base_model_dropdown = gr.Dropdown(
132
+ label="Select base Dreambooth model",
133
+ choices=["none"],
134
+ value="none",
135
+ interactive=False,
136
+ visible=False
137
+ )
138
+ with gr.Column(visible=False):
139
+ gr.Markdown(
140
+ """
141
+ ### Minimalism is an example portrait LoRA, triggered by specific prompt words. More details can be found on the [Wiki](https://github.com/aigc-apps/CogVideoX-Fun/wiki/Training-Lora).
142
+ """
143
+ )
144
+ with gr.Row():
145
+ lora_model_dropdown = gr.Dropdown(
146
+ label="Select LoRA model",
147
+ choices=["none"],
148
+ value="none",
149
+ interactive=True,
150
+ )
151
+
152
+ lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.55, minimum=0, maximum=2, interactive=True)
153
+
154
+ return base_model_dropdown, lora_model_dropdown, lora_alpha_slider
155
+
156
+ def create_teacache_params(
157
+ enable_teacache = True,
158
+ teacache_threshold = 0.10,
159
+ num_skip_start_steps = 1,
160
+ teacache_offload = False,
161
+ ):
162
+ enable_teacache = gr.Checkbox(label="Enable TeaCache", value=enable_teacache)
163
+ teacache_threshold = gr.Slider(0.00, 0.25, value=teacache_threshold, step=0.01, label="TeaCache Threshold")
164
+ num_skip_start_steps = gr.Slider(0, 10, value=num_skip_start_steps, step=5, label="Number of Skip Start Steps")
165
+ teacache_offload = gr.Checkbox(label="Offload TeaCache to CPU", value=teacache_offload)
166
+ return enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload
167
+
168
+ def create_cfg_skip_params(
169
+ cfg_skip_ratio = 0
170
+ ):
171
+ cfg_skip_ratio = gr.Slider(0.00, 0.50, value=cfg_skip_ratio, step=0.01, label="CFG Skip Ratio", visible=False)
172
+ return cfg_skip_ratio
173
+
174
+ def create_cfg_riflex_k(
175
+ enable_riflex = False,
176
+ riflex_k = 6
177
+ ):
178
+ enable_riflex = gr.Checkbox(label="Enable Riflex", value=enable_riflex, visible=False)
179
+ riflex_k = gr.Slider(0, 10, value=riflex_k, step=1, label="Riflex Intrinsic Frequency Index", visible=False)
180
+ return enable_riflex, riflex_k
181
+
182
+ def create_prompts(
183
+ prompt="A young woman with beautiful and clear eyes and blonde hair standing and white dress in a forest wearing a crown. She seems to be lost in thought, and the camera focuses on her face. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
184
+ negative_prompt="Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"
185
+ ):
186
+ gr.Markdown(
187
+ """
188
+ ### Configs for Generation.
189
+ """
190
+ )
191
+
192
+ prompt_textbox = gr.Textbox(label="Prompt", lines=2, value=prompt)
193
+ negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2, value=negative_prompt)
194
+ return prompt_textbox, negative_prompt_textbox
195
+
196
+ def create_samplers(controller, maximum_step=100):
197
+ with gr.Row():
198
+ sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(controller.scheduler_dict.keys()), value=list(controller.scheduler_dict.keys())[0])
199
+ sample_step_slider = gr.Slider(label="Sampling steps", value=50, minimum=10, maximum=maximum_step, step=1)
200
+
201
+ return sampler_dropdown, sample_step_slider
202
+
203
+ def create_height_width(default_height, default_width, maximum_height, maximum_width):
204
+ resize_method = gr.Radio(
205
+ ["Generate by", "Resize according to Reference"],
206
+ value="Generate by",
207
+ show_label=False,
208
+ )
209
+ width_slider = gr.Slider(label="Width", value=default_width, minimum=128, maximum=maximum_width, step=16)
210
+ height_slider = gr.Slider(label="Height", value=default_height, minimum=128, maximum=maximum_height, step=16)
211
+ base_resolution = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 640, 768, 896, 960, 1024], visible=False)
212
+
213
+ return resize_method, width_slider, height_slider, base_resolution
214
+
215
+ def create_fake_height_width(default_height, default_width, maximum_height, maximum_width):
216
+ resize_method = gr.Radio(
217
+ ["Generate by", "Resize according to Reference"],
218
+ value="Generate by",
219
+ show_label=False,
220
+ )
221
+ width_slider = gr.Slider(label="Width", value=default_width, minimum=128, maximum=maximum_width, step=16, interactive=False)
222
+ height_slider = gr.Slider(label="Height", value=default_height, minimum=128, maximum=maximum_height, step=16, interactive=False)
223
+ base_resolution = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 640, 768, 896, 960, 1024], interactive=False, visible=False)
224
+
225
+ return resize_method, width_slider, height_slider, base_resolution
226
+
227
+ def create_generation_methods_and_video_length(
228
+ generation_method_options,
229
+ default_video_length,
230
+ maximum_video_length
231
+ ):
232
+ with gr.Group():
233
+ generation_method = gr.Radio(
234
+ generation_method_options,
235
+ value="Video Generation",
236
+ show_label=False,
237
+ visible=False
238
+ )
239
+ with gr.Row():
240
+ length_slider = gr.Slider(label="Animation length", value=default_video_length, minimum=1, maximum=maximum_video_length, step=4, visible=False)
241
+ overlap_video_length = gr.Slider(label="Overlap length", value=4, minimum=1, maximum=4, step=1, visible=False)
242
+ partial_video_length = gr.Slider(label="Partial video generation length", value=25, minimum=5, maximum=maximum_video_length, step=4, visible=False)
243
+
244
+ return generation_method, length_slider, overlap_video_length, partial_video_length
245
+
246
+ def create_generation_method(source_method_options, prompt_textbox, support_end_image=True, support_ref_image=False, default_video=None, video_examples=None):
247
+ default_method = source_method_options[0] if source_method_options else "Text to Video"
248
+ source_method = gr.Radio(
249
+ source_method_options,
250
+ value=default_method,
251
+ show_label=False,
252
+ )
253
+ with gr.Column(visible = (default_method == "Image to Video")) as image_to_video_col:
254
+ start_image = gr.Image(
255
+ label="The image at the beginning of the video", show_label=True,
256
+ elem_id="i2v_start", sources="upload", type="filepath",
257
+ )
258
+
259
+ template_gallery_path = ["asset/1.png", "asset/2.png", "asset/3.png", "asset/4.png", "asset/5.png"]
260
+ def select_template(evt: gr.SelectData):
261
+ text = {
262
+ "asset/1.png": "A brown dog is shaking its head and sitting on a light colored sofa in a comfortable room. Behind the dog, there is a framed painting on the shelf surrounded by pink flowers. The soft and warm lighting in the room creates a comfortable atmosphere.",
263
+ "asset/2.png": "A sailboat navigates through moderately rough seas, with waves and ocean spray visible. The sailboat features a white hull and sails, accompanied by an orange sail catching the wind. The sky above shows dramatic, cloudy formations with a sunset or sunrise backdrop, casting warm colors across the scene. The water reflects the golden light, enhancing the visual contrast between the dark ocean and the bright horizon. The camera captures the scene with a dynamic and immersive angle, showcasing the movement of the boat and the energy of the ocean.",
264
+ "asset/3.png": "A stunningly beautiful woman with flowing long hair stands gracefully, her elegant dress rippling and billowing in the gentle wind. Petals falling off. Her serene expression and the natural movement of her attire create an enchanting and captivating scene, full of ethereal charm.",
265
+ "asset/4.png": "An astronaut, clad in a full space suit with a helmet, plays an electric guitar while floating in a cosmic environment filled with glowing particles and rocky textures. The scene is illuminated by a warm light source, creating dramatic shadows and contrasts. The background features a complex geometry, similar to a space station or an alien landscape, indicating a futuristic or otherworldly setting.",
266
+ "asset/5.png": "Fireworks light up the evening sky over a sprawling cityscape with gothic-style buildings featuring pointed towers and clock faces. The city is lit by both artificial lights from the buildings and the colorful bursts of the fireworks. The scene is viewed from an elevated angle, showcasing a vibrant urban environment set against a backdrop of a dramatic, partially cloudy sky at dusk.",
267
+ }[template_gallery_path[evt.index]]
268
+ return template_gallery_path[evt.index], text
269
+
270
+ template_gallery = gr.Gallery(
271
+ template_gallery_path,
272
+ columns=5, rows=1,
273
+ height=140,
274
+ allow_preview=False,
275
+ container=False,
276
+ label="Template Examples",
277
+ )
278
+ template_gallery.select(select_template, None, [start_image, prompt_textbox])
279
+
280
+ with gr.Accordion("The image at the ending of the video", open=False, visible=support_end_image):
281
+ end_image = gr.Image(label="The image at the ending of the video", show_label=False, elem_id="i2v_end", sources="upload", type="filepath")
282
+
283
+ with gr.Column(visible = (default_method == "Video to Video")) as video_to_video_col:
284
+ with gr.Row():
285
+ validation_video = gr.Video(
286
+ label="The video to convert", show_label=True,
287
+ elem_id="v2v", sources=["upload"], value=default_video,
288
+ )
289
+ if video_examples:
290
+ gr.Examples(
291
+ examples=video_examples,
292
+ inputs=[validation_video, prompt_textbox] if len(video_examples[0]) > 1 else validation_video,
293
+ label="Video Examples"
294
+ )
295
+
296
+ # Removed the mask accordion per request (the user said "the mask is not needed"); a hidden placeholder component is kept instead.
297
+ # validation_video_mask = gr.Image(
298
+ # label="The mask of the video to inpaint",
299
+ # show_label=False, elem_id="v2v_mask", sources="upload", type="filepath",
300
+ # visible=False
301
+ # )
302
+ validation_video_mask = gr.Image(visible=False, value=None)
303
+
304
+ # Denoise strength default 1.0, hidden
305
+ denoise_strength = gr.Slider(label="Denoise strength", value=1.00, minimum=0.10, maximum=1.00, step=0.01, visible=False)
306
+
307
+ with gr.Column(visible = False) as control_video_col:
308
+ gr.Markdown(
309
+ """
310
+ Demo pose control video can be downloaded here [URL](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/cogvideox_fun/asset/v1.1/pose.mp4).
311
+ """
312
+ )
313
+ control_video = gr.Video(
314
+ label="The control video", show_label=True,
315
+ elem_id="v2v_control", sources="upload",
316
+ )
317
+ ref_image = gr.Image(
318
+ label="The reference image for control video", show_label=True,
319
+ elem_id="ref_image", sources="upload", type="filepath", visible=support_ref_image
320
+ )
321
+ return image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image
322
+
323
+ def create_cfg_and_seedbox(gradio_version_is_above_4):
324
+ # cfg default 6, hidden
325
+ cfg_scale_slider = gr.Slider(label="CFG Scale", value=6.0, minimum=0, maximum=20, visible=False)
326
+
327
+ with gr.Row():
328
+ seed_textbox = gr.Textbox(label="Seed", value=43)
329
+ seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
330
+ seed_button.click(
331
+ fn=lambda: gr.Textbox(value=random.randint(1, 10**8)) if gradio_version_is_above_4 else gr.Textbox.update(value=random.randint(1, 10**8)),
332
+ inputs=[],
333
+ outputs=[seed_textbox]
334
+ )
335
+ return cfg_scale_slider, seed_textbox, seed_button
336
+
337
+ def create_ui_outputs():
338
+ with gr.Column():
339
+ result_image = gr.Image(label="Generated Image", interactive=False, visible=False)
340
+ result_video = gr.Video(label="Generated Animation", interactive=False)
341
+ infer_progress = gr.Textbox(
342
+ label="Generation Info",
343
+ value="No task currently",
344
+ interactive=False
345
+ )
346
+ return result_image, result_video, infer_progress
347
+
348
+ def create_config(controller):
349
+ gr.Markdown(
350
+ """
351
+ ### Config Path (配置文件路径)
352
+ """
353
+ )
354
+ with gr.Row():
355
+ config_dropdown = gr.Dropdown(
356
+ label="Config Path",
357
+ choices=controller.config_list,
358
+ value=controller.config_path,
359
+ interactive=True,
360
+ )
361
+ config_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
362
+ def refresh_config():
363
+ controller.refresh_config()
364
+ return gr.update(choices=controller.config_list)
365
+ config_refresh_button.click(fn=refresh_config, inputs=[], outputs=[config_dropdown])
366
+ return config_dropdown, config_refresh_button
videox_fun/ui/wan2_2_fun_ui.py ADDED
@@ -0,0 +1,803 @@
1
+ """Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py
2
+ """
3
+ import os
4
+ import random
5
+
6
+ import cv2
7
+ import gradio as gr
8
+ import numpy as np
9
+ import torch
10
+ from omegaconf import OmegaConf
11
+ from PIL import Image
12
+ from safetensors import safe_open
13
+
14
+ from ..data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
15
+ from ..dist import set_multi_gpus_devices, shard_model
16
+ from ..models import (AutoencoderKLWan, AutoencoderKLWan3_8, AutoTokenizer,
17
+ CLIPModel, Wan2_2Transformer3DModel, WanT5EncoderModel)
18
+ from ..models.cache_utils import get_teacache_coefficients
19
+ from ..pipeline import Wan2_2FunControlPipeline, Wan2_2FunPipeline, Wan2_2FunInpaintPipeline
20
+ from ..utils.fp8_optimization import (convert_model_weight_to_float8,
21
+ convert_weight_dtype_wrapper,
22
+ replace_parameters_by_name)
23
+ from ..utils.lora_utils import merge_lora, unmerge_lora
24
+ from ..utils.utils import (filter_kwargs, get_image_latent,
25
+ get_image_to_video_latent,
26
+ get_video_to_video_latent, save_videos_grid, timer)
27
+ from .controller import (Fun_Controller, Fun_Controller_Client,
28
+ all_cheduler_dict, css, ddpm_scheduler_dict,
29
+ flow_scheduler_dict, gradio_version,
30
+ gradio_version_is_above_4)
31
+ from .ui import (create_cfg_and_seedbox, create_cfg_riflex_k,
32
+ create_cfg_skip_params, create_config,
33
+ create_fake_finetune_models_checkpoints,
34
+ create_fake_height_width, create_fake_model_checkpoints,
35
+ create_fake_model_type, create_finetune_models_checkpoints,
36
+ create_generation_method,
37
+ create_generation_methods_and_video_length,
38
+ create_height_width, create_model_checkpoints,
39
+ create_model_type, create_prompts, create_samplers,
40
+ create_teacache_params, create_ui_outputs)
41
+
42
+
43
+ class Wan2_2_Fun_Controller(Fun_Controller):
44
+ def update_diffusion_transformer(self, diffusion_transformer_dropdown):
45
+ print(f"Update diffusion transformer: {diffusion_transformer_dropdown}")
46
+ self.model_name = diffusion_transformer_dropdown
47
+ self.diffusion_transformer_dropdown = diffusion_transformer_dropdown
48
+ if diffusion_transformer_dropdown == "none":
49
+ return gr.update()
50
+ Chosen_AutoencoderKL = {
51
+ "AutoencoderKLWan": AutoencoderKLWan,
52
+ "AutoencoderKLWan3_8": AutoencoderKLWan3_8
53
+ }[self.config['vae_kwargs'].get('vae_type', 'AutoencoderKLWan')]
54
+ self.vae = Chosen_AutoencoderKL.from_pretrained(
55
+ os.path.join(diffusion_transformer_dropdown, self.config['vae_kwargs'].get('vae_subpath', 'vae')),
56
+ additional_kwargs=OmegaConf.to_container(self.config['vae_kwargs']),
57
+ ).to(self.weight_dtype)
58
+
59
+ # Get Transformer
60
+ self.transformer = Wan2_2Transformer3DModel.from_pretrained(
61
+ os.path.join(diffusion_transformer_dropdown, self.config['transformer_additional_kwargs'].get('transformer_low_noise_model_subpath', 'transformer')),
62
+ transformer_additional_kwargs=OmegaConf.to_container(self.config['transformer_additional_kwargs']),
63
+ low_cpu_mem_usage=True,
64
+ torch_dtype=self.weight_dtype,
65
+ )
66
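+ # Wan2.2 "moe" checkpoints ship a second high-noise transformer alongside the low-noise one; for "single" checkpoints transformer_2 stays None.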
+ if self.config['transformer_additional_kwargs'].get('transformer_combination_type', 'single') == "moe":
67
+ self.transformer_2 = Wan2_2Transformer3DModel.from_pretrained(
68
+ os.path.join(diffusion_transformer_dropdown, self.config['transformer_additional_kwargs'].get('transformer_high_noise_model_subpath', 'transformer')),
69
+ transformer_additional_kwargs=OmegaConf.to_container(self.config['transformer_additional_kwargs']),
70
+ low_cpu_mem_usage=True,
71
+ torch_dtype=self.weight_dtype,
72
+ )
73
+ else:
74
+ self.transformer_2 = None
75
+
76
+ # Get Tokenizer
77
+ self.tokenizer = AutoTokenizer.from_pretrained(
78
+ os.path.join(diffusion_transformer_dropdown, self.config['text_encoder_kwargs'].get('tokenizer_subpath', 'tokenizer')),
79
+ )
80
+
81
+ # Get Text encoder
82
+ self.text_encoder = WanT5EncoderModel.from_pretrained(
83
+ os.path.join(diffusion_transformer_dropdown, self.config['text_encoder_kwargs'].get('text_encoder_subpath', 'text_encoder')),
84
+ additional_kwargs=OmegaConf.to_container(self.config['text_encoder_kwargs']),
85
+ low_cpu_mem_usage=True,
86
+ torch_dtype=self.weight_dtype,
87
+ )
88
+ self.text_encoder = self.text_encoder.eval()
89
+
90
+ Chosen_Scheduler = self.scheduler_dict[list(self.scheduler_dict.keys())[0]]
91
+ self.scheduler = Chosen_Scheduler(
92
+ **filter_kwargs(Chosen_Scheduler, OmegaConf.to_container(self.config['scheduler_kwargs']))
93
+ )
94
+
95
+ # Get pipeline
96
+ if self.model_type == "Inpaint":
97
+ if self.transformer.config.in_channels != self.vae.config.latent_channels:
98
+ self.pipeline = Wan2_2FunInpaintPipeline(
99
+ vae=self.vae,
100
+ tokenizer=self.tokenizer,
101
+ text_encoder=self.text_encoder,
102
+ transformer=self.transformer,
103
+ transformer_2=self.transformer_2,
104
+ scheduler=self.scheduler,
105
+ )
106
+ else:
107
+ self.pipeline = Wan2_2FunPipeline(
108
+ vae=self.vae,
109
+ tokenizer=self.tokenizer,
110
+ text_encoder=self.text_encoder,
111
+ transformer=self.transformer,
112
+ transformer_2=self.transformer_2,
113
+ scheduler=self.scheduler,
114
+ )
115
+ else:
116
+ self.pipeline = Wan2_2FunControlPipeline(
117
+ vae=self.vae,
118
+ tokenizer=self.tokenizer,
119
+ text_encoder=self.text_encoder,
120
+ transformer=self.transformer,
121
+ transformer_2=self.transformer_2,
122
+ scheduler=self.scheduler,
123
+ )
124
+
125
+ if self.ulysses_degree > 1 or self.ring_degree > 1:
126
+ from functools import partial
127
+ self.transformer.enable_multi_gpus_inference()
128
+ if self.transformer_2 is not None:
129
+ self.transformer_2.enable_multi_gpus_inference()
130
+ if self.fsdp_dit:
131
+ shard_fn = partial(shard_model, device_id=self.device, param_dtype=self.weight_dtype)
132
+ self.pipeline.transformer = shard_fn(self.pipeline.transformer)
133
+ if self.transformer_2 is not None:
134
+ self.pipeline.transformer_2 = shard_fn(self.pipeline.transformer_2)
135
+ print("Add FSDP DIT")
136
+ if self.fsdp_text_encoder:
137
+ shard_fn = partial(shard_model, device_id=self.device, param_dtype=self.weight_dtype)
138
+ self.pipeline.text_encoder = shard_fn(self.pipeline.text_encoder)
139
+ print("Add FSDP TEXT ENCODER")
140
+
141
+ if self.compile_dit:
142
+ for i in range(len(self.pipeline.transformer.blocks)):
143
+ self.pipeline.transformer.blocks[i] = torch.compile(self.pipeline.transformer.blocks[i])
144
+ if self.transformer_2 is not None:
145
+ for i in range(len(self.pipeline.transformer_2.blocks)):
146
+ self.pipeline.transformer_2.blocks[i] = torch.compile(self.pipeline.transformer_2.blocks[i])
147
+ print("Add Compile")
148
+
149
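+ # Pick the memory strategy: sequential/model CPU offload moves weights off the GPU between uses, while the qfloat8 variants additionally convert transformer weights to float8.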
+ if self.GPU_memory_mode == "sequential_cpu_offload":
150
+ replace_parameters_by_name(self.transformer, ["modulation",], device=self.device)
151
+ self.transformer.freqs = self.transformer.freqs.to(device=self.device)
152
+ if self.transformer_2 is not None:
153
+ replace_parameters_by_name(self.transformer_2, ["modulation",], device=self.device)
154
+ self.transformer_2.freqs = self.transformer_2.freqs.to(device=self.device)
155
+ self.pipeline.enable_sequential_cpu_offload(device=self.device)
156
+ elif self.GPU_memory_mode == "model_cpu_offload_and_qfloat8":
157
+ convert_model_weight_to_float8(self.transformer, exclude_module_name=["modulation",], device=self.device)
158
+ convert_weight_dtype_wrapper(self.transformer, self.weight_dtype)
159
+ if self.transformer_2 is not None:
160
+ convert_model_weight_to_float8(self.transformer_2, exclude_module_name=["modulation",], device=self.device)
161
+ convert_weight_dtype_wrapper(self.transformer_2, self.weight_dtype)
162
+ self.pipeline.enable_model_cpu_offload(device=self.device)
163
+ elif self.GPU_memory_mode == "model_cpu_offload":
164
+ self.pipeline.enable_model_cpu_offload(device=self.device)
165
+ elif self.GPU_memory_mode == "model_full_load_and_qfloat8":
166
+ convert_model_weight_to_float8(self.transformer, exclude_module_name=["modulation",], device=self.device)
167
+ convert_weight_dtype_wrapper(self.transformer, self.weight_dtype)
168
+ if self.transformer_2 is not None:
169
+ convert_model_weight_to_float8(self.transformer_2, exclude_module_name=["modulation",], device=self.device)
170
+ convert_weight_dtype_wrapper(self.transformer_2, self.weight_dtype)
171
+ self.pipeline.to(self.device)
172
+ else:
173
+ self.pipeline.to(self.device)
174
+ print("Update diffusion transformer done")
175
+ return gr.update()
176
+
177
+ @timer
178
+ def generate(
179
+ self,
180
+ diffusion_transformer_dropdown,
181
+ base_model_dropdown,
182
+ lora_model_dropdown,
183
+ lora_alpha_slider,
184
+ prompt_textbox,
185
+ negative_prompt_textbox,
186
+ sampler_dropdown,
187
+ sample_step_slider,
188
+ resize_method,
189
+ width_slider,
190
+ height_slider,
191
+ base_resolution,
192
+ generation_method,
193
+ length_slider,
194
+ overlap_video_length,
195
+ partial_video_length,
196
+ cfg_scale_slider,
197
+ start_image,
198
+ end_image,
199
+ validation_video,
200
+ validation_video_mask,
201
+ control_video,
202
+ denoise_strength,
203
+ seed_textbox,
204
+ ref_image = None,
205
+ enable_teacache = None,
206
+ teacache_threshold = None,
207
+ num_skip_start_steps = None,
208
+ teacache_offload = None,
209
+ cfg_skip_ratio = None,
210
+ enable_riflex = None,
211
+ riflex_k = None,
212
+ base_model_2_dropdown=None,
213
+ lora_model_2_dropdown=None,
214
+ fps = None,
215
+ is_api = False,
216
+ ):
217
+ self.clear_cache()
218
+
219
+ print(f"Input checking.")
220
+ _, comment = self.input_check(
221
+ resize_method, generation_method, start_image, end_image, validation_video, control_video, is_api
222
+ )
223
+ print(f"Input checking down")
224
+ if comment != "OK":
225
+ return "", comment
226
+ is_image = True if generation_method == "Image Generation" else False
227
+
228
+ if self.base_model_path != base_model_dropdown:
229
+ self.update_base_model(base_model_dropdown)
230
+ if self.base_model_2_path != base_model_2_dropdown:
231
+ self.update_base_model(base_model_2_dropdown, is_checkpoint_2=True)
232
+
233
+ if self.lora_model_path != lora_model_dropdown:
234
+ self.update_lora_model(lora_model_dropdown)
235
+ if self.lora_model_2_path != lora_model_2_dropdown:
236
+ self.update_lora_model(lora_model_2_dropdown, is_checkpoint_2=True)
237
+
238
+ print(f"Load scheduler.")
239
+ scheduler_config = self.pipeline.scheduler.config
240
+ if sampler_dropdown == "Flow_Unipc" or sampler_dropdown == "Flow_DPM++":
241
+ scheduler_config['shift'] = 1
242
+ self.pipeline.scheduler = self.scheduler_dict[sampler_dropdown].from_config(scheduler_config)
243
+ print(f"Load scheduler down.")
244
+
245
+ if resize_method == "Resize according to Reference":
246
+ print(f"Calculate height and width according to Reference.")
247
+ height_slider, width_slider = self.get_height_width_from_reference(
248
+ base_resolution, start_image, validation_video, control_video,
249
+ )
250
+
251
+ if self.lora_model_path != "none":
252
+ print(f"Merge Lora.")
253
+ self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
254
+ if self.transformer_2 is not None:
255
+ self.pipeline = merge_lora(self.pipeline, self.lora_model_2_path, multiplier=lora_alpha_slider, sub_transformer_name="transformer_2")
256
+ print(f"Merge Lora done.")
257
+
258
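+ # TeaCache skips redundant transformer computation across similar denoising steps; the coefficients are model-specific, and transformer_2 shares the cache state of the first transformer.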
+ coefficients = get_teacache_coefficients(self.diffusion_transformer_dropdown) if enable_teacache else None
259
+ if coefficients is not None:
260
+ print(f"Enable TeaCache with threshold {teacache_threshold} and skip the first {num_skip_start_steps} steps.")
261
+ self.pipeline.transformer.enable_teacache(
262
+ coefficients, sample_step_slider, teacache_threshold, num_skip_start_steps=num_skip_start_steps, offload=teacache_offload
263
+ )
264
+ if self.transformer_2 is not None:
265
+ self.pipeline.transformer_2.share_teacache(self.pipeline.transformer)
266
+ else:
267
+ print(f"Disable TeaCache.")
268
+ self.pipeline.transformer.disable_teacache()
269
+ if self.transformer_2 is not None:
270
+ self.pipeline.transformer_2.disable_teacache()
271
+
272
+ if cfg_skip_ratio is not None and cfg_skip_ratio >= 0:
273
+ print(f"Enable cfg_skip_ratio {cfg_skip_ratio}.")
274
+ self.pipeline.transformer.enable_cfg_skip(cfg_skip_ratio, sample_step_slider)
275
+ if self.transformer_2 is not None:
276
+ self.pipeline.transformer_2.share_cfg_skip(self.pipeline.transformer)
277
+
278
+ print(f"Generate seed.")
279
+ if seed_textbox != "" and int(seed_textbox) != -1: torch.manual_seed(int(seed_textbox))
280
+ else: seed_textbox = np.random.randint(0, 1e10)
281
+ generator = torch.Generator(device=self.device).manual_seed(int(seed_textbox))
282
+ print(f"Generate seed done.")
283
+
284
+ if fps is None:
285
+ fps = 16
286
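+ # boundary is the denoising-timestep ratio at which the pipeline hands off from the high-noise transformer_2 to the low-noise transformer.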
+ boundary = self.config['transformer_additional_kwargs'].get('boundary', 0.875)
287
+
288
+ if enable_riflex:
289
+ print(f"Enable riflex")
290
+ latent_frames = (int(length_slider) - 1) // self.vae.config.temporal_compression_ratio + 1
291
+ self.pipeline.transformer.enable_riflex(k = riflex_k, L_test = latent_frames if not is_image else 1)
292
+ if self.transformer_2 is not None:
293
+ self.pipeline.transformer_2.enable_riflex(k = riflex_k, L_test = latent_frames if not is_image else 1)
294
+
295
+ try:
296
+ print(f"Generation.")
297
+ if self.model_type == "Inpaint":
298
+ if self.transformer.config.in_channels != self.vae.config.latent_channels:
299
+ if validation_video is not None:
300
+ input_video, input_video_mask, _, clip_image = get_video_to_video_latent(validation_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), validation_video_mask=validation_video_mask, fps=fps)
301
+ else:
302
+ input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider))
303
+
304
+ sample = self.pipeline(
305
+ prompt_textbox,
306
+ negative_prompt = negative_prompt_textbox,
307
+ num_inference_steps = sample_step_slider,
308
+ guidance_scale = cfg_scale_slider,
309
+ width = width_slider,
310
+ height = height_slider,
311
+ num_frames = length_slider if not is_image else 1,
312
+ generator = generator,
313
+
314
+ video = input_video,
315
+ mask_video = input_video_mask,
316
+ boundary = boundary
317
+ ).videos
318
+ else:
319
+ sample = self.pipeline(
320
+ prompt_textbox,
321
+ negative_prompt = negative_prompt_textbox,
322
+ num_inference_steps = sample_step_slider,
323
+ guidance_scale = cfg_scale_slider,
324
+ width = width_slider,
325
+ height = height_slider,
326
+ num_frames = length_slider if not is_image else 1,
327
+ generator = generator,
328
+ boundary = boundary
329
+ ).videos
330
+ else:
331
+ inpaint_video, inpaint_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, video_length=length_slider if not is_image else 1, sample_size=(height_slider, width_slider))
332
+
333
+ if ref_image is not None:
334
+ ref_image = get_image_latent(ref_image, sample_size=(height_slider, width_slider))
335
+
336
+ input_video, input_video_mask, _, _ = get_video_to_video_latent(control_video, video_length=length_slider if not is_image else 1, sample_size=(height_slider, width_slider), fps=fps, ref_image=None)
337
+
338
+ sample = self.pipeline(
339
+ prompt_textbox,
340
+ negative_prompt = negative_prompt_textbox,
341
+ num_inference_steps = sample_step_slider,
342
+ guidance_scale = cfg_scale_slider,
343
+ width = width_slider,
344
+ height = height_slider,
345
+ num_frames = length_slider if not is_image else 1,
346
+ generator = generator,
347
+
348
+ video = inpaint_video,
349
+ mask_video = inpaint_video_mask,
350
+ control_video = input_video,
351
+ ref_image = ref_image,
352
+ boundary = boundary,
353
+ ).videos
354
+ print(f"Generation done.")
355
+ except Exception as e:
356
+ self.auto_model_clear_cache(self.pipeline.transformer)
357
+ self.auto_model_clear_cache(self.pipeline.text_encoder)
358
+ self.auto_model_clear_cache(self.pipeline.vae)
359
+ self.clear_cache()
360
+
361
+ print(f"Error. error information is {str(e)}")
362
+ if self.lora_model_path != "none":
363
+ self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
364
+ if is_api:
365
+ return "", f"Error. error information is {str(e)}"
366
+ else:
367
+ return gr.update(), gr.update(), f"Error. error information is {str(e)}"
368
+
369
+ self.clear_cache()
370
+ # lora part
371
+ if self.lora_model_path != "none":
372
+ print(f"Unmerge Lora.")
373
+ self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
374
+ print(f"Unmerge Lora done.")
375
+
376
+ print(f"Saving outputs.")
377
+ save_sample_path = self.save_outputs(
378
+ is_image, length_slider, sample, fps=fps
379
+ )
380
+ print(f"Saving outputs done.")
381
+
382
+ if is_image or length_slider == 1:
383
+ if is_api:
384
+ return save_sample_path, "Success"
385
+ else:
386
+ if gradio_version_is_above_4:
387
+ return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success"
388
+ else:
389
+ return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success"
390
+ else:
391
+ if is_api:
392
+ return save_sample_path, "Success"
393
+ else:
394
+ if gradio_version_is_above_4:
395
+ return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
396
+ else:
397
+ return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success"
398
+
399
+ Wan2_2_Fun_Controller_Host = Wan2_2_Fun_Controller
400
+ Wan2_2_Fun_Controller_Client = Fun_Controller_Client
401
+
402
+ def ui(GPU_memory_mode, scheduler_dict, config_path, compile_dit, weight_dtype, savedir_sample=None):
403
+ controller = Wan2_2_Fun_Controller(
404
+ GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
405
+ config_path=config_path, compile_dit=compile_dit,
406
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
407
+ )
408
+
409
+ with gr.Blocks(css=css) as demo:
410
+ gr.Markdown(
411
+ """
412
+ # Wan2.2-Fun:
413
+
414
+ A Wan model with more flexible generation conditions, capable of producing videos at different resolutions, around 5 seconds long at 16 fps (1 to 81 frames), as well as image-to-video generation.
415
+
416
+ [Github](https://github.com/aigc-apps/VideoX-Fun/)
417
+ """
418
+ )
419
+ with gr.Column(variant="panel"):
420
+ config_dropdown, config_refresh_button = create_config(controller)
421
+ model_type = create_model_type(visible=True)
422
+ diffusion_transformer_dropdown, diffusion_transformer_refresh_button = \
423
+ create_model_checkpoints(controller, visible=True)
424
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button = \
425
+ create_finetune_models_checkpoints(controller, visible=True, add_checkpoint_2=True)
426
+ base_model_dropdown, base_model_2_dropdown = base_model_dropdown
427
+ lora_model_dropdown, lora_model_2_dropdown = lora_model_dropdown
428
+
429
+ with gr.Row():
430
+ enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload = \
431
+ create_teacache_params(True, 0.10, 1, False)
432
+ cfg_skip_ratio = create_cfg_skip_params(0)
433
+ enable_riflex, riflex_k = create_cfg_riflex_k(False, 6)
434
+
435
+ with gr.Column(variant="panel"):
436
+ prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")
437
+
438
+ with gr.Row():
439
+ with gr.Column():
440
+ sampler_dropdown, sample_step_slider = create_samplers(controller)
441
+
442
+ resize_method, width_slider, height_slider, base_resolution = create_height_width(
443
+ default_height = 480, default_width = 832, maximum_height = 1344,
444
+ maximum_width = 1344,
445
+ )
446
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
447
+ create_generation_methods_and_video_length(
448
+ ["Video Generation", "Image Generation"],
449
+ default_video_length=81,
450
+ maximum_video_length=161,
451
+ )
452
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
453
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video Control (视频控制)"], prompt_textbox, support_ref_image=True
454
+ )
455
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
456
+
457
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
458
+
459
+ result_image, result_video, infer_progress = create_ui_outputs()
460
+
461
+ config_dropdown.change(
462
+ fn=controller.update_config,
463
+ inputs=[config_dropdown],
464
+ outputs=[]
465
+ )
466
+
467
+ model_type.change(
468
+ fn=controller.update_model_type,
469
+ inputs=[model_type],
470
+ outputs=[]
471
+ )
472
+
473
+ def upload_generation_method(generation_method):
474
+ if generation_method == "Video Generation":
475
+ return [gr.update(visible=True, maximum=161, value=81, interactive=True), gr.update(visible=False), gr.update(visible=False)]
476
+ elif generation_method == "Image Generation":
477
+ return [gr.update(minimum=1, maximum=1, value=1, interactive=False), gr.update(visible=False), gr.update(visible=False)]
478
+ else:
479
+ return [gr.update(visible=True, maximum=1344), gr.update(visible=True), gr.update(visible=True)]
480
+ generation_method.change(
481
+ upload_generation_method, generation_method, [length_slider, overlap_video_length, partial_video_length]
482
+ )
483
+
484
+ def upload_source_method(source_method):
485
+ if source_method == "Text to Video (文本到视频)":
486
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
487
+ elif source_method == "Image to Video (图片到视频)":
488
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
489
+ elif source_method == "Video to Video (视频到视频)":
490
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)]
491
+ else:
492
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()]
493
+ source_method.change(
494
+ upload_source_method, source_method, [
495
+ image_to_video_col, video_to_video_col, control_video_col, start_image, end_image,
496
+ validation_video, validation_video_mask, control_video
497
+ ]
498
+ )
499
+
500
+ def upload_resize_method(resize_method):
501
+ if resize_method == "Generate by":
502
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
503
+ else:
504
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
505
+ resize_method.change(
506
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
507
+ )
508
+
509
+ generate_button.click(
510
+ fn=controller.generate,
511
+ inputs=[
512
+ diffusion_transformer_dropdown,
513
+ base_model_dropdown,
514
+ lora_model_dropdown,
515
+ lora_alpha_slider,
516
+ prompt_textbox,
517
+ negative_prompt_textbox,
518
+ sampler_dropdown,
519
+ sample_step_slider,
520
+ resize_method,
521
+ width_slider,
522
+ height_slider,
523
+ base_resolution,
524
+ generation_method,
525
+ length_slider,
526
+ overlap_video_length,
527
+ partial_video_length,
528
+ cfg_scale_slider,
529
+ start_image,
530
+ end_image,
531
+ validation_video,
532
+ validation_video_mask,
533
+ control_video,
534
+ denoise_strength,
535
+ seed_textbox,
536
+ ref_image,
537
+ enable_teacache,
538
+ teacache_threshold,
539
+ num_skip_start_steps,
540
+ teacache_offload,
541
+ cfg_skip_ratio,
542
+ enable_riflex,
543
+ riflex_k,
544
+ base_model_2_dropdown,
545
+ lora_model_2_dropdown
546
+ ],
547
+ outputs=[result_image, result_video, infer_progress]
548
+ )
549
+ return demo, controller
550
+
551
+ def ui_host(GPU_memory_mode, scheduler_dict, model_name, model_type, config_path, compile_dit, weight_dtype, savedir_sample=None):
552
+ controller = Wan2_2_Fun_Controller_Host(
553
+ GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type,
554
+ config_path=config_path, compile_dit=compile_dit,
555
+ weight_dtype=weight_dtype, savedir_sample=savedir_sample,
556
+ )
557
+
558
+ with gr.Blocks(css=css) as demo:
559
+ gr.Markdown(
560
+ """
561
+ # Wan2.2-Fun:
562
+
563
+ A Wan model with more flexible generation conditions, capable of producing videos at different resolutions, around 5 seconds long at 16 fps (1 to 81 frames), as well as image-to-video generation.
564
+
565
+ [Github](https://github.com/aigc-apps/VideoX-Fun/)
566
+ """
567
+ )
568
+ with gr.Column(variant="panel"):
569
+ model_type = create_fake_model_type(visible=False)
570
+ diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True)
571
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider = \
572
+ create_fake_finetune_models_checkpoints(visible=True, add_checkpoint_2=True)
573
+ base_model_dropdown, base_model_2_dropdown = base_model_dropdown
574
+ lora_model_dropdown, lora_model_2_dropdown = lora_model_dropdown
575
+
576
+ with gr.Row():
577
+ enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload = \
578
+ create_teacache_params(True, 0.10, 1, False)
579
+ cfg_skip_ratio = create_cfg_skip_params(0)
580
+ enable_riflex, riflex_k = create_cfg_riflex_k(False, 6)
581
+
582
+ with gr.Column(variant="panel"):
583
+ prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")
584
+
585
+ with gr.Row():
586
+ with gr.Column():
587
+ sampler_dropdown, sample_step_slider = create_samplers(controller)
588
+
589
+ resize_method, width_slider, height_slider, base_resolution = create_height_width(
590
+ default_height = 480, default_width = 832, maximum_height = 1344,
591
+ maximum_width = 1344,
592
+ )
593
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
594
+ create_generation_methods_and_video_length(
595
+ ["Video Generation", "Image Generation"],
596
+ default_video_length=81,
597
+ maximum_video_length=161,
598
+ )
599
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
600
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video Control (视频控制)"], prompt_textbox, support_ref_image=True
601
+ )
602
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
603
+
604
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
605
+
606
+ result_image, result_video, infer_progress = create_ui_outputs()
607
+
608
+ def upload_generation_method(generation_method):
609
+ if generation_method == "Video Generation":
610
+ return gr.update(visible=True, minimum=1, maximum=161, value=81, interactive=True)
611
+ elif generation_method == "Image Generation":
612
+ return gr.update(minimum=1, maximum=1, value=1, interactive=False)
613
+ generation_method.change(
614
+ upload_generation_method, generation_method, [length_slider]
615
+ )
616
+
617
+ def upload_source_method(source_method):
618
+ if source_method == "Text to Video (文本到视频)":
619
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
620
+ elif source_method == "Image to Video (图片到视频)":
621
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
622
+ elif source_method == "Video to Video (视频到视频)":
623
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)]
624
+ else:
625
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()]
626
+ source_method.change(
627
+ upload_source_method, source_method, [
628
+ image_to_video_col, video_to_video_col, control_video_col, start_image, end_image,
629
+ validation_video, validation_video_mask, control_video
630
+ ]
631
+ )
632
+
633
+ def upload_resize_method(resize_method):
634
+ if resize_method == "Generate by":
635
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
636
+ else:
637
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
638
+ resize_method.change(
639
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
640
+ )
641
+
642
+ generate_button.click(
643
+ fn=controller.generate,
644
+ inputs=[
645
+ diffusion_transformer_dropdown,
646
+ base_model_dropdown,
647
+ lora_model_dropdown,
648
+ lora_alpha_slider,
649
+ prompt_textbox,
650
+ negative_prompt_textbox,
651
+ sampler_dropdown,
652
+ sample_step_slider,
653
+ resize_method,
654
+ width_slider,
655
+ height_slider,
656
+ base_resolution,
657
+ generation_method,
658
+ length_slider,
659
+ overlap_video_length,
660
+ partial_video_length,
661
+ cfg_scale_slider,
662
+ start_image,
663
+ end_image,
664
+ validation_video,
665
+ validation_video_mask,
666
+ control_video,
667
+ denoise_strength,
668
+ seed_textbox,
669
+ ref_image,
670
+ enable_teacache,
671
+ teacache_threshold,
672
+ num_skip_start_steps,
673
+ teacache_offload,
674
+ cfg_skip_ratio,
675
+ enable_riflex,
676
+ riflex_k,
677
+ base_model_2_dropdown,
678
+ lora_model_2_dropdown
679
+ ],
680
+ outputs=[result_image, result_video, infer_progress]
681
+ )
682
+ return demo, controller
683
+
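Both `ui_host` and `ui_client` reuse the same visibility-toggling idiom: each `.change()` handler (`upload_source_method`, `upload_resize_method`, `upload_generation_method`) returns one `gr.update(...)` per output component, using `visible=` to show or hide a column and `value=None` to clear a stale upload. A minimal sketch of that idiom follows; the component names in it are hypothetical, not taken from this file.

```python
import gradio as gr

with gr.Blocks() as sketch_demo:
    source = gr.Radio(["Image", "Video"], value="Image", label="Source")
    with gr.Column(visible=True) as image_col:
        start_img = gr.Image(label="Start image")
    with gr.Column(visible=False) as video_col:
        val_video = gr.Video(label="Validation video")

    def toggle_source(choice):
        # One gr.update per output, in the same order as the outputs list:
        # visible=... shows/hides a column, value=None clears a stale upload.
        if choice == "Image":
            return [gr.update(visible=True), gr.update(visible=False), gr.update(), gr.update(value=None)]
        return [gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update()]

    source.change(toggle_source, source, [image_col, video_col, start_img, val_video])

# sketch_demo.launch()
```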
684
+ def ui_client(scheduler_dict, model_name, savedir_sample=None):
685
+ controller = Wan2_2_Fun_Controller_Client(scheduler_dict, savedir_sample)
686
+
687
+ with gr.Blocks(css=css) as demo:
688
+ gr.Markdown(
689
+ """
690
+ # Wan2.2-Fun:
691
+
692
+ A Wan model with more flexible generation conditions, capable of producing videos at different resolutions, around 5 seconds long at 16 fps (1 to 81 frames), as well as videos generated from images.
693
+
694
+ [Github](https://github.com/aigc-apps/VideoX-Fun/)
695
+ """
696
+ )
697
+ with gr.Column(variant="panel"):
698
+ diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True)
699
+ base_model_dropdown, lora_model_dropdown, lora_alpha_slider = \
700
+ create_fake_finetune_models_checkpoints(visible=True, add_checkpoint_2=True)
701
+ base_model_dropdown, base_model_2_dropdown = base_model_dropdown
702
+ lora_model_dropdown, lora_model_2_dropdown = lora_model_dropdown
703
+
704
+ with gr.Row():
705
+ enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload = \
706
+ create_teacache_params(True, 0.10, 1, False)
707
+ cfg_skip_ratio = create_cfg_skip_params(0)
708
+ enable_riflex, riflex_k = create_cfg_riflex_k(False, 6)
709
+
710
+ with gr.Column(variant="panel"):
711
+ prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")
712
+
713
+ with gr.Row():
714
+ with gr.Column():
715
+ sampler_dropdown, sample_step_slider = create_samplers(controller, maximum_step=50)
716
+
717
+ resize_method, width_slider, height_slider, base_resolution = create_fake_height_width(
718
+ default_height = 480, default_width = 832, maximum_height = 1344,
719
+ maximum_width = 1344,
720
+ )
721
+ generation_method, length_slider, overlap_video_length, partial_video_length = \
722
+ create_generation_methods_and_video_length(
723
+ ["Video Generation", "Image Generation"],
724
+ default_video_length=81,
725
+ maximum_video_length=161,
726
+ )
727
+ image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video, ref_image = create_generation_method(
728
+ ["Text to Video (文本到视频)", "Image to Video (图片到视频)"], prompt_textbox
729
+ )
730
+
731
+ cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4)
732
+
733
+ generate_button = gr.Button(value="Generate (生成)", variant='primary')
734
+
735
+ result_image, result_video, infer_progress = create_ui_outputs()
736
+
737
+ def upload_generation_method(generation_method):
738
+ if generation_method == "Video Generation":
739
+ return gr.update(visible=True, minimum=5, maximum=161, value=49, interactive=True)
740
+ elif generation_method == "Image Generation":
741
+ return gr.update(minimum=1, maximum=1, value=1, interactive=False)
742
+ generation_method.change(
743
+ upload_generation_method, generation_method, [length_slider]
744
+ )
745
+
746
+ def upload_source_method(source_method):
747
+ if source_method == "Text to Video (文本到视频)":
748
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)]
749
+ elif source_method == "Image to Video (图片到视频)":
750
+ return [gr.update(visible=True), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None)]
751
+ else:
752
+ return [gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(), gr.update()]
753
+ source_method.change(
754
+ upload_source_method, source_method, [image_to_video_col, video_to_video_col, start_image, end_image, validation_video, validation_video_mask]
755
+ )
756
+
757
+ def upload_resize_method(resize_method):
758
+ if resize_method == "Generate by":
759
+ return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
760
+ else:
761
+ return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
762
+ resize_method.change(
763
+ upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
764
+ )
765
+
766
+ generate_button.click(
767
+ fn=controller.generate,
768
+ inputs=[
769
+ diffusion_transformer_dropdown,
770
+ base_model_dropdown,
771
+ lora_model_dropdown,
772
+ lora_alpha_slider,
773
+ prompt_textbox,
774
+ negative_prompt_textbox,
775
+ sampler_dropdown,
776
+ sample_step_slider,
777
+ resize_method,
778
+ width_slider,
779
+ height_slider,
780
+ base_resolution,
781
+ generation_method,
782
+ length_slider,
783
+ cfg_scale_slider,
784
+ start_image,
785
+ end_image,
786
+ validation_video,
787
+ validation_video_mask,
788
+ denoise_strength,
789
+ seed_textbox,
790
+ ref_image,
791
+ enable_teacache,
792
+ teacache_threshold,
793
+ num_skip_start_steps,
794
+ teacache_offload,
795
+ cfg_skip_ratio,
796
+ enable_riflex,
797
+ riflex_k,
798
+ base_model_2_dropdown,
799
+ lora_model_2_dropdown
800
+ ],
801
+ outputs=[result_image, result_video, infer_progress]
802
+ )
803
+ return demo, controller
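All three builders end with `return demo, controller`, so a launcher script is expected to unpack the Gradio Blocks object and serve it. The sketch below shows one plausible way to do that for the client UI; the scheduler mapping, checkpoint path, save directory, and port are assumptions for illustration and are not taken from this commit's app.py.

```python
# Hypothetical launcher for the client UI defined above. The argument values
# (scheduler mapping, model path, save directory, port) are assumptions.
from diffusers import FlowMatchEulerDiscreteScheduler
from videox_fun.ui.wan2_2_fun_ui import ui_client

scheduler_dict = {"Flow": FlowMatchEulerDiscreteScheduler}  # assumed sampler mapping

demo, controller = ui_client(
    scheduler_dict=scheduler_dict,
    model_name="models/Diffusion_Transformer/Wan2.2-Fun-A14B-InP",  # hypothetical path
    savedir_sample="samples",
)
demo.launch(server_name="0.0.0.0", server_port=7860, inbrowser=True)
```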