haodongli commited on
Commit
4b35c4e
·
1 Parent(s): 2cfedc8
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +34 -0
  2. README.md +15 -0
  3. app.py +77 -0
  4. ckpt/model_config.yaml +7 -0
  5. configs/img_config/data_diode_all.yaml +8 -0
  6. configs/img_config/data_eth3d.yaml +11 -0
  7. configs/img_config/data_kitti_eigen_test.yaml +8 -0
  8. configs/img_config/data_nyu_test.yaml +8 -0
  9. configs/img_config/data_scannet_val.yaml +6 -0
  10. configs/scannetv1_test.txt +312 -0
  11. configs/vid_config/img_sintel.yaml +3 -0
  12. configs/vid_config/vid_bonn.yaml +2 -0
  13. configs/vid_config/vid_kitti.yaml +2 -0
  14. configs/vid_config/vid_scannet.yaml +3 -0
  15. configs/vid_config/vid_sintel.yaml +3 -0
  16. diffsynth/__init__.py +4 -0
  17. diffsynth/configs/__init__.py +0 -0
  18. diffsynth/configs/model_config.py +705 -0
  19. diffsynth/data/__init__.py +1 -0
  20. diffsynth/data/video.py +244 -0
  21. diffsynth/distributed/__init__.py +0 -0
  22. diffsynth/distributed/xdit_context_parallel.py +129 -0
  23. diffsynth/models/__init__.py +1 -0
  24. diffsynth/models/downloader.py +116 -0
  25. diffsynth/models/model_manager.py +416 -0
  26. diffsynth/models/tiler.py +234 -0
  27. diffsynth/models/utils.py +185 -0
  28. diffsynth/models/wan_video_camera_controller.py +221 -0
  29. diffsynth/models/wan_video_dit.py +974 -0
  30. diffsynth/models/wan_video_image_encoder.py +902 -0
  31. diffsynth/models/wan_video_motion_controller.py +44 -0
  32. diffsynth/models/wan_video_text_encoder.py +269 -0
  33. diffsynth/models/wan_video_vace.py +113 -0
  34. diffsynth/models/wan_video_vae.py +828 -0
  35. diffsynth/pipelines/__init__.py +1 -0
  36. diffsynth/pipelines/wan_video_new_determine.py +1730 -0
  37. diffsynth/schedulers/__init__.py +3 -0
  38. diffsynth/schedulers/continuous_ode.py +59 -0
  39. diffsynth/schedulers/ddim.py +105 -0
  40. diffsynth/schedulers/flow_match.py +116 -0
  41. diffsynth/util/alignment.py +131 -0
  42. diffsynth/util/depth_transform.py +98 -0
  43. diffsynth/util/metric.py +337 -0
  44. diffsynth/util/normal_utils.py +78 -0
  45. diffsynth/util/seed_all.py +33 -0
  46. diffsynth/vram_management/__init__.py +2 -0
  47. diffsynth/vram_management/gradient_checkpointing.py +34 -0
  48. diffsynth/vram_management/layers.py +167 -0
  49. examples/__init__.py +0 -0
  50. examples/dataset/__init__.py +17 -0
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ output/
2
+ models/
3
+ ._____temp/
4
+ .vscode/
5
+ *__pycache__/*
6
+ *.pyc
7
+ video/
8
+ *.safetensors
9
+ *.pth
10
+ omni/
11
+ pcd/
12
+
13
+ models/
14
+ models_ms/
15
+ !diffsynth/models**
16
+ diffsynth/models/__pycache__/
17
+ ckpt/DVD/
18
+ outputs/
19
+ inference_results/
20
+ *.mp4
21
+ ckpt/DVD
22
+ .msc
23
+ .mv
24
+ ckpt/.cache
25
+ ckpt/.gitattributes
26
+ ckpt/README.md
27
+ overlap_plots/
28
+ test_script/test_from_trained_all_vid_test.py
29
+ test_script/test_single_video_batch.py
30
+ DVD.egg-info/
31
+ infer_bash/video_test.sh
32
+ ckpt/test/
33
+ !demo/robot_navi.mp4
34
+ !demo/drone.mp4
README.md CHANGED
@@ -5,10 +5,25 @@ colorFrom: green
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 6.9.0
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  short_description: Official demo of DVD (https://dvd-project.github.io/)
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 6.9.0
8
+ python_version: 3.10.20
9
  app_file: app.py
10
  pinned: false
11
  license: apache-2.0
12
  short_description: Official demo of DVD (https://dvd-project.github.io/)
13
+ tags:
14
+ - video diffusion
15
+ - video depth estimation
16
  ---
17
 
18
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
19
+
20
+ If you find our work useful in your research, please consider citing our paper🌹:
21
+
22
+ ```
23
+ @article{zhang2026dvd,
24
+ title={DVD: Deterministic Video Depth Estimation with Generative Priors},
25
+ author={Zhang, Hongfei and Chen, Harold Haodong and Liao, Chenfei and He, Jing and Zhang, Zixin and Li, Haodong and Liang, Yihao and Chen, Kanghao and Ren, Bin and Zheng, Xu and Yang, Shuai and Zhou, Kun and Li, Yinchuan and Sebe, Nicu and Chen, Ying-Cong},
26
+ journal={arXiv preprint arXiv:2603.12250},
27
+ year={2026}
28
+ }
29
+ ```
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# `spaces` must be imported before anything that touches CUDA,
# so the Hugging Face ZeroGPU hooks are installed first.
import spaces  # must be first!
import os
from pathlib import Path

# Keep Gradio's temp/cache directory inside the repo so uploads and
# rendered outputs land in a predictable, writable location.
REPO_ROOT = Path(__file__).resolve().parent
GRADIO_TMP = REPO_ROOT / ".gradio_cache"
GRADIO_TMP.mkdir(parents=True, exist_ok=True)

os.environ["GRADIO_TEMP_DIR"] = str(GRADIO_TMP)
print(f"Gradio temp/cache dir: {GRADIO_TMP}")

import torch
from argparse import Namespace
import subprocess
# Star-import brings in load_model / load_video_data / predict_depth /
# save_results / OmegaConf used below.
from test_script.test_single_video import *

import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"
yaml_args = OmegaConf.load(f"{REPO_ROOT}/ckpt/model_config.yaml")
# Lazily constructed inside the GPU-decorated callback on first request.
pipeline = None
22
+
23
+
24
@spaces.GPU
def fn(input_video):
    """Run DVD depth estimation on one uploaded video.

    Parameters
    ----------
    input_video : str
        Filesystem path to the uploaded video (supplied by ``gr.Video``).

    Returns
    -------
    str
        Path to the rendered depth video, handed back to the output
        ``gr.Video`` component.
    """
    # Only `pipeline` is assigned here; `yaml_args` and `device` are read-only
    # module globals, so they need no `global` declaration.
    global pipeline
    if pipeline is None:
        # First request: fetch the checkpoint if absent, then build the model.
        if not os.path.exists(f"{REPO_ROOT}/ckpt/model.safetensors"):
            subprocess.run(["bash", f"{REPO_ROOT}/infer_bash/download_ckpt.sh"], check=True)
        pipeline = load_model(f"{REPO_ROOT}/ckpt", yaml_args)

    # Decode and resize the clip to the model's working resolution.
    input_tensor, orig_size, origin_fps = load_video_data(Namespace(
        input_video=input_video,
        height=480,
        width=640,
    ))
    # Sliding-window inference over the frame sequence.
    depth = predict_depth(pipeline, input_tensor, orig_size, Namespace(
        window_size=81,
        overlap=21
    ))
    # Encode the colorized depth frames back to a video file.
    output_video = save_results(depth, origin_fps, Namespace(
        input_video=input_video,
        output_dir=REPO_ROOT,
        grayscale=False
    ))

    return output_video
49
+
50
+
51
if __name__ == "__main__":
    # Assemble the demo UI: one video in, one depth video out.
    demo = gr.Interface(
        fn=fn,
        title="DVD: Deterministic Video Depth Estimation with Generative Priors",
        description="""
        <strong>Please consider starring <span style="color: orange">&#9733;</span> our <a href="https://github.com/EnVision-Research/DVD" target="_blank" rel="noopener noreferrer">GitHub Repo</a> if you find this demo useful!</strong>
        """,
        inputs=[gr.Video(label="Input Video", autoplay=True)],
        outputs=[gr.Video(label="Output Video", autoplay=True)],
        examples=[
            [f"{REPO_ROOT}/demo/drone.mp4"],
            [f"{REPO_ROOT}/demo/robot_navi.mp4"]
        ]
    )

    # Serialize requests: the pipeline holds one set of GPU weights.
    demo.queue(default_concurrency_limit=1)
    demo.launch(
        # server_name="0.0.0.0",
        # server_port=1324,
    )
ckpt/model_config.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ model_id_with_origin_paths: Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors,Wan-AI/Wan2.1-T2V-1.3B:Wan2.1_VAE.pth
2
+ trainable_models: dit
3
+ mode: regression
4
+ denoise_step: 0.5
5
+ training_target: x
6
+ lora_base_model: dit
7
+ lora_rank: 512
configs/img_config/data_diode_all.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: diode
2
+ disp_name: diode_val_all
3
+ # dir: diode
4
+ # dataset_dir: diode
5
+ # filename_ls_path: data_split/diode/diode_val_all_filename_list.txt
6
+ processing_res: 640
7
+ dir: diode
8
+ filename: data_split/diode/diode_val_all_filename_list.txt
configs/img_config/data_eth3d.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: eth3d
2
+ disp_name: eth3d_full
3
+ # dataset_dir: eth3d
4
+ # dir: eth3d/eth3d.tar
5
+ # filename_ls_path: data_split/eth3d/eth3d_filename_list.txt
6
+ dir: eth3d
7
+ # dir: eth3d/eth3d.tar
8
+ filename: data_split/eth3d/eth3d_filename_list.txt
9
+ resize_to_hw: [480, 720]
10
+ # processing_res: 768
11
+ # alignment_max_res: 1024
configs/img_config/data_kitti_eigen_test.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: kitti
2
+ disp_name: kitti_eigen_test_full
3
+ # dataset_dir: kitti
4
+ # filename_ls_path: data_split/kitti/eigen_test_files_with_gt.txt
5
+ kitti_bm_crop: true
6
+ valid_mask_crop: eigen
7
+ dir: kitti
8
+ filename: data_split/kitti/eigen_test_files_with_gt.txt
configs/img_config/data_nyu_test.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ name: nyu_v2
2
+ disp_name: nyu_test_full
3
+ # dataset_dir: nyuv2
4
+ # filename_ls_path: data_split/nyu/labeled/filename_list_test.txt
5
+ eigen_valid_mask: true
6
+
7
+ dir: nyuv2
8
+ filename: data_split/nyu/labeled/filename_list_test.txt
configs/img_config/data_scannet_val.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ name: scannet
2
+ disp_name: scannet_val_800_1
3
+ # dataset_dir: scannet
4
+ # filename_ls_path: data_split/scannet/scannet_val_sampled_list_800_1.txt
5
+ dir: scannet
6
+ filename: data_split/scannet/scannet_val_sampled_list_800_1.txt
configs/scannetv1_test.txt ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scene0568_00
2
+ scene0568_01
3
+ scene0568_02
4
+ scene0304_00
5
+ scene0488_00
6
+ scene0488_01
7
+ scene0412_00
8
+ scene0412_01
9
+ scene0217_00
10
+ scene0019_00
11
+ scene0019_01
12
+ scene0414_00
13
+ scene0575_00
14
+ scene0575_01
15
+ scene0575_02
16
+ scene0426_00
17
+ scene0426_01
18
+ scene0426_02
19
+ scene0426_03
20
+ scene0549_00
21
+ scene0549_01
22
+ scene0578_00
23
+ scene0578_01
24
+ scene0578_02
25
+ scene0665_00
26
+ scene0665_01
27
+ scene0050_00
28
+ scene0050_01
29
+ scene0050_02
30
+ scene0257_00
31
+ scene0025_00
32
+ scene0025_01
33
+ scene0025_02
34
+ scene0583_00
35
+ scene0583_01
36
+ scene0583_02
37
+ scene0701_00
38
+ scene0701_01
39
+ scene0701_02
40
+ scene0580_00
41
+ scene0580_01
42
+ scene0565_00
43
+ scene0169_00
44
+ scene0169_01
45
+ scene0655_00
46
+ scene0655_01
47
+ scene0655_02
48
+ scene0063_00
49
+ scene0221_00
50
+ scene0221_01
51
+ scene0591_00
52
+ scene0591_01
53
+ scene0591_02
54
+ scene0678_00
55
+ scene0678_01
56
+ scene0678_02
57
+ scene0462_00
58
+ scene0427_00
59
+ scene0595_00
60
+ scene0193_00
61
+ scene0193_01
62
+ scene0164_00
63
+ scene0164_01
64
+ scene0164_02
65
+ scene0164_03
66
+ scene0598_00
67
+ scene0598_01
68
+ scene0598_02
69
+ scene0599_00
70
+ scene0599_01
71
+ scene0599_02
72
+ scene0328_00
73
+ scene0300_00
74
+ scene0300_01
75
+ scene0354_00
76
+ scene0458_00
77
+ scene0458_01
78
+ scene0423_00
79
+ scene0423_01
80
+ scene0423_02
81
+ scene0307_00
82
+ scene0307_01
83
+ scene0307_02
84
+ scene0606_00
85
+ scene0606_01
86
+ scene0606_02
87
+ scene0432_00
88
+ scene0432_01
89
+ scene0608_00
90
+ scene0608_01
91
+ scene0608_02
92
+ scene0651_00
93
+ scene0651_01
94
+ scene0651_02
95
+ scene0430_00
96
+ scene0430_01
97
+ scene0689_00
98
+ scene0357_00
99
+ scene0357_01
100
+ scene0574_00
101
+ scene0574_01
102
+ scene0574_02
103
+ scene0329_00
104
+ scene0329_01
105
+ scene0329_02
106
+ scene0153_00
107
+ scene0153_01
108
+ scene0616_00
109
+ scene0616_01
110
+ scene0671_00
111
+ scene0671_01
112
+ scene0618_00
113
+ scene0382_00
114
+ scene0382_01
115
+ scene0490_00
116
+ scene0621_00
117
+ scene0607_00
118
+ scene0607_01
119
+ scene0149_00
120
+ scene0695_00
121
+ scene0695_01
122
+ scene0695_02
123
+ scene0695_03
124
+ scene0389_00
125
+ scene0377_00
126
+ scene0377_01
127
+ scene0377_02
128
+ scene0342_00
129
+ scene0139_00
130
+ scene0629_00
131
+ scene0629_01
132
+ scene0629_02
133
+ scene0496_00
134
+ scene0633_00
135
+ scene0633_01
136
+ scene0518_00
137
+ scene0652_00
138
+ scene0406_00
139
+ scene0406_01
140
+ scene0406_02
141
+ scene0144_00
142
+ scene0144_01
143
+ scene0494_00
144
+ scene0278_00
145
+ scene0278_01
146
+ scene0316_00
147
+ scene0609_00
148
+ scene0609_01
149
+ scene0609_02
150
+ scene0609_03
151
+ scene0084_00
152
+ scene0084_01
153
+ scene0084_02
154
+ scene0696_00
155
+ scene0696_01
156
+ scene0696_02
157
+ scene0351_00
158
+ scene0351_01
159
+ scene0643_00
160
+ scene0644_00
161
+ scene0645_00
162
+ scene0645_01
163
+ scene0645_02
164
+ scene0081_00
165
+ scene0081_01
166
+ scene0081_02
167
+ scene0647_00
168
+ scene0647_01
169
+ scene0535_00
170
+ scene0353_00
171
+ scene0353_01
172
+ scene0353_02
173
+ scene0559_00
174
+ scene0559_01
175
+ scene0559_02
176
+ scene0593_00
177
+ scene0593_01
178
+ scene0246_00
179
+ scene0653_00
180
+ scene0653_01
181
+ scene0064_00
182
+ scene0064_01
183
+ scene0356_00
184
+ scene0356_01
185
+ scene0356_02
186
+ scene0030_00
187
+ scene0030_01
188
+ scene0030_02
189
+ scene0222_00
190
+ scene0222_01
191
+ scene0338_00
192
+ scene0338_01
193
+ scene0338_02
194
+ scene0378_00
195
+ scene0378_01
196
+ scene0378_02
197
+ scene0660_00
198
+ scene0553_00
199
+ scene0553_01
200
+ scene0553_02
201
+ scene0527_00
202
+ scene0663_00
203
+ scene0663_01
204
+ scene0663_02
205
+ scene0664_00
206
+ scene0664_01
207
+ scene0664_02
208
+ scene0334_00
209
+ scene0334_01
210
+ scene0334_02
211
+ scene0046_00
212
+ scene0046_01
213
+ scene0046_02
214
+ scene0203_00
215
+ scene0203_01
216
+ scene0203_02
217
+ scene0088_00
218
+ scene0088_01
219
+ scene0088_02
220
+ scene0088_03
221
+ scene0086_00
222
+ scene0086_01
223
+ scene0086_02
224
+ scene0670_00
225
+ scene0670_01
226
+ scene0256_00
227
+ scene0256_01
228
+ scene0256_02
229
+ scene0249_00
230
+ scene0441_00
231
+ scene0658_00
232
+ scene0704_00
233
+ scene0704_01
234
+ scene0187_00
235
+ scene0187_01
236
+ scene0131_00
237
+ scene0131_01
238
+ scene0131_02
239
+ scene0207_00
240
+ scene0207_01
241
+ scene0207_02
242
+ scene0461_00
243
+ scene0011_00
244
+ scene0011_01
245
+ scene0343_00
246
+ scene0251_00
247
+ scene0077_00
248
+ scene0077_01
249
+ scene0684_00
250
+ scene0684_01
251
+ scene0550_00
252
+ scene0686_00
253
+ scene0686_01
254
+ scene0686_02
255
+ scene0208_00
256
+ scene0500_00
257
+ scene0500_01
258
+ scene0552_00
259
+ scene0552_01
260
+ scene0648_00
261
+ scene0648_01
262
+ scene0435_00
263
+ scene0435_01
264
+ scene0435_02
265
+ scene0435_03
266
+ scene0690_00
267
+ scene0690_01
268
+ scene0693_00
269
+ scene0693_01
270
+ scene0693_02
271
+ scene0700_00
272
+ scene0700_01
273
+ scene0700_02
274
+ scene0699_00
275
+ scene0231_00
276
+ scene0231_01
277
+ scene0231_02
278
+ scene0697_00
279
+ scene0697_01
280
+ scene0697_02
281
+ scene0697_03
282
+ scene0474_00
283
+ scene0474_01
284
+ scene0474_02
285
+ scene0474_03
286
+ scene0474_04
287
+ scene0474_05
288
+ scene0355_00
289
+ scene0355_01
290
+ scene0146_00
291
+ scene0146_01
292
+ scene0146_02
293
+ scene0196_00
294
+ scene0702_00
295
+ scene0702_01
296
+ scene0702_02
297
+ scene0314_00
298
+ scene0277_00
299
+ scene0277_01
300
+ scene0277_02
301
+ scene0095_00
302
+ scene0095_01
303
+ scene0015_00
304
+ scene0100_00
305
+ scene0100_01
306
+ scene0100_02
307
+ scene0558_00
308
+ scene0558_01
309
+ scene0558_02
310
+ scene0685_00
311
+ scene0685_01
312
+ scene0685_02
configs/vid_config/img_sintel.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: sintel
2
+ dir: Sintel/training
3
+ stack_scene_depth: false
configs/vid_config/vid_bonn.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ name: bonn
2
+ dir: rgbd_bonn_dataset
configs/vid_config/vid_kitti.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ name: kitti
2
+ dir: kitti_depth
configs/vid_config/vid_scannet.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: scannet
2
+ dir: scannet
3
+ split_ls: 'configs/scannetv1_test.txt'
configs/vid_config/vid_sintel.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name: sintel
2
+ dir: Sintel/training
3
+ stack_scene_depth: true
diffsynth/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .data import *
2
+ from .models import *
3
+ from .pipelines import *
4
+ from .schedulers import *
diffsynth/configs/__init__.py ADDED
File without changes
diffsynth/configs/model_config.py ADDED
@@ -0,0 +1,705 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing_extensions import Literal, TypeAlias
2
+
3
+ from ..models.wan_video_dit import WanModel
4
+ from ..models.wan_video_image_encoder import WanImageEncoder
5
+ from ..models.wan_video_motion_controller import WanMotionControllerModel
6
+ from ..models.wan_video_text_encoder import WanTextEncoder
7
+ from ..models.wan_video_vace import VaceWanModel
8
+ from ..models.wan_video_vae import WanVideoVAE
9
+
10
# Configs for detecting model type automatically from a checkpoint's state dict.
# Format of each entry:
#   (state_dict_keys_hash, state_dict_keys_hash_with_shape, model_names, model_classes, model_resource)
# Detection is keyed on the hash, so each hash should appear at most once.
model_loader_configs = [
    (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "6d6ccde6845b95ad9114ab993d917893", ["wan_video_dit"], [WanModel], "civitai"),
    # NOTE(review): a second, identical entry for hash 6bfcfb3b342cb286ce886889d519a77e
    # was removed here — hash-based lookup made the duplicate unreachable.
    (None, "349723183fc063b2bfc10bb2835cf677", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "efa44cddf936c70abd0ea28b6cbe946c", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "3ef3b1f8e1dab83d5b71fd7b617f859f", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "70ddad9d3a133785da5ea371aae09504", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "26bde73488a92e64cc20b0a7485b9e5b", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "ac6a5aa74f4a0aab6f64eb9a72f19901", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "b61c605c2adbd23124d152ed28e049ae", ["wan_video_dit"], [WanModel], "civitai"),
    (None, "a61453409b67cd3246cf0c3bebad47ba", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
    (None, "7a513e1f257a861512b1afd387a8ecd9", ["wan_video_dit", "wan_video_vace"], [WanModel, VaceWanModel], "civitai"),
    (None, "cb104773c6c2cb6df4f9529ad5c60d0b", ["wan_video_dit"], [WanModel], "diffusers"),
    (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
    (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
    (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
    (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
    (None, "dbd5ec76bbf977983f972c151d545389", ["wan_video_motion_controller"], [WanMotionControllerModel], "civitai"),
]
34
# Configs for detecting model type automatically from a Hugging Face `config.json`.
# Format of each entry:
#   (architecture_in_huggingface_config, huggingface_lib, model_name, redirected_architecture)
huggingface_model_loader_configs = [
    ("ChatGLMModel", "diffsynth.models.kolors_text_encoder", "kolors_text_encoder", None),
    ("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
    ("BloomForCausalLM", "transformers.models.bloom.modeling_bloom", "beautiful_prompt", None),
    ("Qwen2ForCausalLM", "transformers.models.qwen2.modeling_qwen2", "qwen_prompt", None),
    ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
    ("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
    ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
    ("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
    ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
    ("LlavaForConditionalGeneration", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoMLLMEncoder"),
    ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
    ("Qwen2_5_VLForConditionalGeneration", "diffsynth.models.qwenvl", "qwenvl", "Qwen25VL_7b_Embedder"),
]

# Configs for patch models, detected by state-dict hash.
# Format of each entry:
#   (state_dict_keys_hash_with_shape, model_name, model_class, extra_kwargs)
patch_model_loader_configs = [
]
54
+
55
+ preset_models_on_huggingface = {
56
+ "HunyuanDiT": [
57
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
58
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
59
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
60
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
61
+ ],
62
+ "stable-video-diffusion-img2vid-xt": [
63
+ ("stabilityai/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
64
+ ],
65
+ "ExVideo-SVD-128f-v1": [
66
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
67
+ ],
68
+ # Stable Diffusion
69
+ "StableDiffusion_v15": [
70
+ ("benjamin-paine/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
71
+ ],
72
+ "DreamShaper_8": [
73
+ ("Yntec/Dreamshaper8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
74
+ ],
75
+ # Textual Inversion
76
+ "TextualInversion_VeryBadImageNegative_v1.3": [
77
+ ("gemasai/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
78
+ ],
79
+ # Stable Diffusion XL
80
+ "StableDiffusionXL_v1": [
81
+ ("stabilityai/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
82
+ ],
83
+ "BluePencilXL_v200": [
84
+ ("frankjoshua/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
85
+ ],
86
+ "StableDiffusionXL_Turbo": [
87
+ ("stabilityai/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
88
+ ],
89
+ # Stable Diffusion 3
90
+ "StableDiffusion3": [
91
+ ("stabilityai/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
92
+ ],
93
+ "StableDiffusion3_without_T5": [
94
+ ("stabilityai/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
95
+ ],
96
+ # ControlNet
97
+ "ControlNet_v11f1p_sd15_depth": [
98
+ ("lllyasviel/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
99
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
100
+ ],
101
+ "ControlNet_v11p_sd15_softedge": [
102
+ ("lllyasviel/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
103
+ ("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators")
104
+ ],
105
+ "ControlNet_v11f1e_sd15_tile": [
106
+ ("lllyasviel/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
107
+ ],
108
+ "ControlNet_v11p_sd15_lineart": [
109
+ ("lllyasviel/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
110
+ ("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
111
+ ("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators")
112
+ ],
113
+ "ControlNet_union_sdxl_promax": [
114
+ ("xinsir/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
115
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
116
+ ],
117
+ # AnimateDiff
118
+ "AnimateDiff_v2": [
119
+ ("guoyww/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
120
+ ],
121
+ "AnimateDiff_xl_beta": [
122
+ ("guoyww/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
123
+ ],
124
+
125
+ # Qwen Prompt
126
+ "QwenPrompt": [
127
+ ("Qwen/Qwen2-1.5B-Instruct", "config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
128
+ ("Qwen/Qwen2-1.5B-Instruct", "generation_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
129
+ ("Qwen/Qwen2-1.5B-Instruct", "model.safetensors", "models/QwenPrompt/qwen2-1.5b-instruct"),
130
+ ("Qwen/Qwen2-1.5B-Instruct", "special_tokens_map.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
131
+ ("Qwen/Qwen2-1.5B-Instruct", "tokenizer.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
132
+ ("Qwen/Qwen2-1.5B-Instruct", "tokenizer_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
133
+ ("Qwen/Qwen2-1.5B-Instruct", "merges.txt", "models/QwenPrompt/qwen2-1.5b-instruct"),
134
+ ("Qwen/Qwen2-1.5B-Instruct", "vocab.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
135
+ ],
136
+ # Beautiful Prompt
137
+ "BeautifulPrompt": [
138
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
139
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
140
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
141
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
142
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
143
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
144
+ ],
145
+ # Omost prompt
146
+ "OmostPrompt":[
147
+ ("lllyasviel/omost-llama-3-8b-4bits", "model-00001-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
148
+ ("lllyasviel/omost-llama-3-8b-4bits", "model-00002-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
149
+ ("lllyasviel/omost-llama-3-8b-4bits", "tokenizer.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
150
+ ("lllyasviel/omost-llama-3-8b-4bits", "tokenizer_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
151
+ ("lllyasviel/omost-llama-3-8b-4bits", "config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
152
+ ("lllyasviel/omost-llama-3-8b-4bits", "generation_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
153
+ ("lllyasviel/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
154
+ ("lllyasviel/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
155
+ ],
156
+ # Translator
157
+ "opus-mt-zh-en": [
158
+ ("Helsinki-NLP/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
159
+ ("Helsinki-NLP/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
160
+ ("Helsinki-NLP/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
161
+ ("Helsinki-NLP/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
162
+ ("Helsinki-NLP/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
163
+ ("Helsinki-NLP/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
164
+ ("Helsinki-NLP/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
165
+ ("Helsinki-NLP/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
166
+ ],
167
+ # IP-Adapter
168
+ "IP-Adapter-SD": [
169
+ ("h94/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
170
+ ("h94/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
171
+ ],
172
+ "IP-Adapter-SDXL": [
173
+ ("h94/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
174
+ ("h94/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
175
+ ],
176
+ "SDXL-vae-fp16-fix": [
177
+ ("madebyollin/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
178
+ ],
179
+ # Kolors
180
+ "Kolors": [
181
+ ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
182
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
183
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
184
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
185
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
186
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
187
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
188
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
189
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
190
+ ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
191
+ ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
192
+ ],
193
+ # FLUX
194
+ "FLUX.1-dev": [
195
+ ("black-forest-labs/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
196
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
197
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
198
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
199
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
200
+ ("black-forest-labs/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
201
+ ("black-forest-labs/FLUX.1-dev", "flux1-dev.safetensors", "models/FLUX/FLUX.1-dev"),
202
+ ],
203
+ "InstantX/FLUX.1-dev-IP-Adapter": {
204
+ "file_list": [
205
+ ("InstantX/FLUX.1-dev-IP-Adapter", "ip-adapter.bin", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter"),
206
+ ("google/siglip-so400m-patch14-384", "model.safetensors", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
207
+ ("google/siglip-so400m-patch14-384", "config.json", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
208
+ ],
209
+ "load_path": [
210
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
211
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
212
+ ],
213
+ },
214
+ # RIFE
215
+ "RIFE": [
216
+ ("AlexWortega/RIFE", "flownet.pkl", "models/RIFE"),
217
+ ],
218
+ # CogVideo
219
+ "CogVideoX-5B": [
220
+ ("THUDM/CogVideoX-5b", "text_encoder/config.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
221
+ ("THUDM/CogVideoX-5b", "text_encoder/model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
222
+ ("THUDM/CogVideoX-5b", "text_encoder/model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
223
+ ("THUDM/CogVideoX-5b", "text_encoder/model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
224
+ ("THUDM/CogVideoX-5b", "transformer/config.json", "models/CogVideo/CogVideoX-5b/transformer"),
225
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/transformer"),
226
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
227
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
228
+ ("THUDM/CogVideoX-5b", "vae/diffusion_pytorch_model.safetensors", "models/CogVideo/CogVideoX-5b/vae"),
229
+ ],
230
+ # Stable Diffusion 3.5
231
+ "StableDiffusion3.5-large": [
232
+ ("stabilityai/stable-diffusion-3.5-large", "sd3.5_large.safetensors", "models/stable_diffusion_3"),
233
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
234
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
235
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
236
+ ],
237
+ }
238
+ preset_models_on_modelscope = {
239
+ # Hunyuan DiT
240
+ "HunyuanDiT": [
241
+ ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
242
+ ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
243
+ ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
244
+ ("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
245
+ ],
246
+ # Stable Video Diffusion
247
+ "stable-video-diffusion-img2vid-xt": [
248
+ ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
249
+ ],
250
+ # ExVideo
251
+ "ExVideo-SVD-128f-v1": [
252
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
253
+ ],
254
+ "ExVideo-CogVideoX-LoRA-129f-v1": [
255
+ ("ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1", "ExVideo-CogVideoX-LoRA-129f-v1.safetensors", "models/lora"),
256
+ ],
257
+ # Stable Diffusion
258
+ "StableDiffusion_v15": [
259
+ ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
260
+ ],
261
+ "DreamShaper_8": [
262
+ ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
263
+ ],
264
+ "AingDiffusion_v12": [
265
+ ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
266
+ ],
267
+ "Flat2DAnimerge_v45Sharp": [
268
+ ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
269
+ ],
270
+ # Textual Inversion
271
+ "TextualInversion_VeryBadImageNegative_v1.3": [
272
+ ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
273
+ ],
274
+ # Stable Diffusion XL
275
+ "StableDiffusionXL_v1": [
276
+ ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
277
+ ],
278
+ "BluePencilXL_v200": [
279
+ ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
280
+ ],
281
+ "StableDiffusionXL_Turbo": [
282
+ ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
283
+ ],
284
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0": [
285
+ ("sd_lora/zyd232_ChineseInkStyle_SDXL_v1_0", "zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", "models/lora"),
286
+ ],
287
+ # Stable Diffusion 3
288
+ "StableDiffusion3": [
289
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
290
+ ],
291
+ "StableDiffusion3_without_T5": [
292
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
293
+ ],
294
+ # ControlNet
295
+ "ControlNet_v11f1p_sd15_depth": [
296
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
297
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
298
+ ],
299
+ "ControlNet_v11p_sd15_softedge": [
300
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
301
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
302
+ ],
303
+ "ControlNet_v11f1e_sd15_tile": [
304
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
305
+ ],
306
+ "ControlNet_v11p_sd15_lineart": [
307
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
308
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
309
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
310
+ ],
311
+ "ControlNet_union_sdxl_promax": [
312
+ ("AI-ModelScope/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
313
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
314
+ ],
315
+ "Annotators:Depth": [
316
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
317
+ ],
318
+ "Annotators:Softedge": [
319
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
320
+ ],
321
+ "Annotators:Lineart": [
322
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
323
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
324
+ ],
325
+ "Annotators:Normal": [
326
+ ("sd_lora/Annotators", "scannet.pt", "models/Annotators"),
327
+ ],
328
+ "Annotators:Openpose": [
329
+ ("sd_lora/Annotators", "body_pose_model.pth", "models/Annotators"),
330
+ ("sd_lora/Annotators", "facenet.pth", "models/Annotators"),
331
+ ("sd_lora/Annotators", "hand_pose_model.pth", "models/Annotators"),
332
+ ],
333
+ # AnimateDiff
334
+ "AnimateDiff_v2": [
335
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
336
+ ],
337
+ "AnimateDiff_xl_beta": [
338
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
339
+ ],
340
+ # RIFE
341
+ "RIFE": [
342
+ ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
343
+ ],
344
+ # Qwen Prompt
345
+ "QwenPrompt": {
346
+ "file_list": [
347
+ ("qwen/Qwen2-1.5B-Instruct", "config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
348
+ ("qwen/Qwen2-1.5B-Instruct", "generation_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
349
+ ("qwen/Qwen2-1.5B-Instruct", "model.safetensors", "models/QwenPrompt/qwen2-1.5b-instruct"),
350
+ ("qwen/Qwen2-1.5B-Instruct", "special_tokens_map.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
351
+ ("qwen/Qwen2-1.5B-Instruct", "tokenizer.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
352
+ ("qwen/Qwen2-1.5B-Instruct", "tokenizer_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
353
+ ("qwen/Qwen2-1.5B-Instruct", "merges.txt", "models/QwenPrompt/qwen2-1.5b-instruct"),
354
+ ("qwen/Qwen2-1.5B-Instruct", "vocab.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
355
+ ],
356
+ "load_path": [
357
+ "models/QwenPrompt/qwen2-1.5b-instruct",
358
+ ],
359
+ },
360
+ # Beautiful Prompt
361
+ "BeautifulPrompt": {
362
+ "file_list": [
363
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
364
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
365
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
366
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
367
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
368
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
369
+ ],
370
+ "load_path": [
371
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
372
+ ],
373
+ },
374
+ # Omost prompt
375
+ "OmostPrompt": {
376
+ "file_list": [
377
+ ("Omost/omost-llama-3-8b-4bits", "model-00001-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
378
+ ("Omost/omost-llama-3-8b-4bits", "model-00002-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
379
+ ("Omost/omost-llama-3-8b-4bits", "tokenizer.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
380
+ ("Omost/omost-llama-3-8b-4bits", "tokenizer_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
381
+ ("Omost/omost-llama-3-8b-4bits", "config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
382
+ ("Omost/omost-llama-3-8b-4bits", "generation_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
383
+ ("Omost/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
384
+ ("Omost/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
385
+ ],
386
+ "load_path": [
387
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
388
+ ],
389
+ },
390
+ # Translator
391
+ "opus-mt-zh-en": {
392
+ "file_list": [
393
+ ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
394
+ ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
395
+ ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
396
+ ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
397
+ ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
398
+ ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
399
+ ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
400
+ ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
401
+ ],
402
+ "load_path": [
403
+ "models/translator/opus-mt-zh-en",
404
+ ],
405
+ },
406
+ # IP-Adapter
407
+ "IP-Adapter-SD": [
408
+ ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
409
+ ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
410
+ ],
411
+ "IP-Adapter-SDXL": [
412
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
413
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
414
+ ],
415
+ # Kolors
416
+ "Kolors": {
417
+ "file_list": [
418
+ ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
419
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
420
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
421
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
422
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
423
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
424
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
425
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
426
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
427
+ ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
428
+ ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
429
+ ],
430
+ "load_path": [
431
+ "models/kolors/Kolors/text_encoder",
432
+ "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
433
+ "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors",
434
+ ],
435
+ },
436
+ "SDXL-vae-fp16-fix": [
437
+ ("AI-ModelScope/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
438
+ ],
439
+ # FLUX
440
+ "FLUX.1-dev": {
441
+ "file_list": [
442
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
443
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
444
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
445
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
446
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
447
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
448
+ ("AI-ModelScope/FLUX.1-dev", "flux1-dev.safetensors", "models/FLUX/FLUX.1-dev"),
449
+ ],
450
+ "load_path": [
451
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
452
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
453
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
454
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
455
+ ],
456
+ },
457
+ "FLUX.1-schnell": {
458
+ "file_list": [
459
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
460
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
461
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
462
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
463
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
464
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
465
+ ("AI-ModelScope/FLUX.1-schnell", "flux1-schnell.safetensors", "models/FLUX/FLUX.1-schnell"),
466
+ ],
467
+ "load_path": [
468
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
469
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
470
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
471
+ "models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors"
472
+ ],
473
+ },
474
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
475
+ ("InstantX/FLUX.1-dev-Controlnet-Union-alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha"),
476
+ ],
477
+ "jasperai/Flux.1-dev-Controlnet-Depth": [
478
+ ("jasperai/Flux.1-dev-Controlnet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth"),
479
+ ],
480
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
481
+ ("jasperai/Flux.1-dev-Controlnet-Surface-Normals", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals"),
482
+ ],
483
+ "jasperai/Flux.1-dev-Controlnet-Upscaler": [
484
+ ("jasperai/Flux.1-dev-Controlnet-Upscaler", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler"),
485
+ ],
486
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
487
+ ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha"),
488
+ ],
489
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
490
+ ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"),
491
+ ],
492
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
493
+ ("Shakker-Labs/FLUX.1-dev-ControlNet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth"),
494
+ ],
495
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
496
+ ("Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro"),
497
+ ],
498
+ "InstantX/FLUX.1-dev-IP-Adapter": {
499
+ "file_list": [
500
+ ("InstantX/FLUX.1-dev-IP-Adapter", "ip-adapter.bin", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter"),
501
+ ("AI-ModelScope/siglip-so400m-patch14-384", "model.safetensors", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
502
+ ("AI-ModelScope/siglip-so400m-patch14-384", "config.json", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
503
+ ],
504
+ "load_path": [
505
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
506
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
507
+ ],
508
+ },
509
+ "InfiniteYou":{
510
+ "file_list":[
511
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
512
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors", "models/InfiniteYou/InfuseNetModel"),
513
+ ("ByteDance/InfiniteYou", "infu_flux_v1.0/aes_stage2/image_proj_model.bin", "models/InfiniteYou"),
514
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/1k3d68.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
515
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/2d106det.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
516
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/genderage.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
517
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/glintr100.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
518
+ ("ByteDance/InfiniteYou", "supports/insightface/models/antelopev2/scrfd_10g_bnkps.onnx", "models/InfiniteYou/insightface/models/antelopev2"),
519
+ ],
520
+ "load_path":[
521
+ [
522
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00001-of-00002.safetensors",
523
+ "models/InfiniteYou/InfuseNetModel/diffusion_pytorch_model-00002-of-00002.safetensors"
524
+ ],
525
+ "models/InfiniteYou/image_proj_model.bin",
526
+ ],
527
+ },
528
+ # ESRGAN
529
+ "ESRGAN_x4": [
530
+ ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
531
+ ],
532
+ # RIFE
533
+ "RIFE": [
534
+ ("AI-ModelScope/RIFE", "flownet.pkl", "models/RIFE"),
535
+ ],
536
+ # Omnigen
537
+ "OmniGen-v1": {
538
+ "file_list": [
539
+ ("BAAI/OmniGen-v1", "vae/diffusion_pytorch_model.safetensors", "models/OmniGen/OmniGen-v1/vae"),
540
+ ("BAAI/OmniGen-v1", "model.safetensors", "models/OmniGen/OmniGen-v1"),
541
+ ("BAAI/OmniGen-v1", "config.json", "models/OmniGen/OmniGen-v1"),
542
+ ("BAAI/OmniGen-v1", "special_tokens_map.json", "models/OmniGen/OmniGen-v1"),
543
+ ("BAAI/OmniGen-v1", "tokenizer_config.json", "models/OmniGen/OmniGen-v1"),
544
+ ("BAAI/OmniGen-v1", "tokenizer.json", "models/OmniGen/OmniGen-v1"),
545
+ ],
546
+ "load_path": [
547
+ "models/OmniGen/OmniGen-v1/vae/diffusion_pytorch_model.safetensors",
548
+ "models/OmniGen/OmniGen-v1/model.safetensors",
549
+ ]
550
+ },
551
+ # CogVideo
552
+ "CogVideoX-5B": {
553
+ "file_list": [
554
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/config.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
555
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
556
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
557
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
558
+ ("ZhipuAI/CogVideoX-5b", "transformer/config.json", "models/CogVideo/CogVideoX-5b/transformer"),
559
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/transformer"),
560
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
561
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
562
+ ("ZhipuAI/CogVideoX-5b", "vae/diffusion_pytorch_model.safetensors", "models/CogVideo/CogVideoX-5b/vae"),
563
+ ],
564
+ "load_path": [
565
+ "models/CogVideo/CogVideoX-5b/text_encoder",
566
+ "models/CogVideo/CogVideoX-5b/transformer",
567
+ "models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
568
+ ],
569
+ },
570
+ # Stable Diffusion 3.5
571
+ "StableDiffusion3.5-large": [
572
+ ("AI-ModelScope/stable-diffusion-3.5-large", "sd3.5_large.safetensors", "models/stable_diffusion_3"),
573
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
574
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
575
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
576
+ ],
577
+ "StableDiffusion3.5-medium": [
578
+ ("AI-ModelScope/stable-diffusion-3.5-medium", "sd3.5_medium.safetensors", "models/stable_diffusion_3"),
579
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
580
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
581
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
582
+ ],
583
+ "StableDiffusion3.5-large-turbo": [
584
+ ("AI-ModelScope/stable-diffusion-3.5-large-turbo", "sd3.5_large_turbo.safetensors", "models/stable_diffusion_3"),
585
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
586
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
587
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
588
+ ],
589
+ "HunyuanVideo":{
590
+ "file_list": [
591
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
592
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
593
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
594
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
595
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
596
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
597
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
598
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/vae/pytorch_model.pt", "models/HunyuanVideo/vae"),
599
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideo/transformers")
600
+ ],
601
+ "load_path": [
602
+ "models/HunyuanVideo/text_encoder/model.safetensors",
603
+ "models/HunyuanVideo/text_encoder_2",
604
+ "models/HunyuanVideo/vae/pytorch_model.pt",
605
+ "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
606
+ ],
607
+ },
608
+ "HunyuanVideoI2V":{
609
+ "file_list": [
610
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideoI2V/text_encoder"),
611
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00001-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
612
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00002-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
613
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00003-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
614
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model-00004-of-00004.safetensors", "models/HunyuanVideoI2V/text_encoder_2"),
615
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "config.json", "models/HunyuanVideoI2V/text_encoder_2"),
616
+ ("AI-ModelScope/llava-llama-3-8b-v1_1-transformers", "model.safetensors.index.json", "models/HunyuanVideoI2V/text_encoder_2"),
617
+ ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/vae/pytorch_model.pt", "models/HunyuanVideoI2V/vae"),
618
+ ("AI-ModelScope/HunyuanVideo-I2V", "hunyuan-video-i2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideoI2V/transformers")
619
+ ],
620
+ "load_path": [
621
+ "models/HunyuanVideoI2V/text_encoder/model.safetensors",
622
+ "models/HunyuanVideoI2V/text_encoder_2",
623
+ "models/HunyuanVideoI2V/vae/pytorch_model.pt",
624
+ "models/HunyuanVideoI2V/transformers/mp_rank_00_model_states.pt"
625
+ ],
626
+ },
627
+ "HunyuanVideo-fp8":{
628
+ "file_list": [
629
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
630
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
631
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
632
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
633
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
634
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
635
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
636
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/vae/pytorch_model.pt", "models/HunyuanVideo/vae"),
637
+ ("DiffSynth-Studio/HunyuanVideo-safetensors", "model.fp8.safetensors", "models/HunyuanVideo/transformers")
638
+ ],
639
+ "load_path": [
640
+ "models/HunyuanVideo/text_encoder/model.safetensors",
641
+ "models/HunyuanVideo/text_encoder_2",
642
+ "models/HunyuanVideo/vae/pytorch_model.pt",
643
+ "models/HunyuanVideo/transformers/model.fp8.safetensors"
644
+ ],
645
+ },
646
+ }
647
+ Preset_model_id: TypeAlias = Literal[
648
+ "HunyuanDiT",
649
+ "stable-video-diffusion-img2vid-xt",
650
+ "ExVideo-SVD-128f-v1",
651
+ "ExVideo-CogVideoX-LoRA-129f-v1",
652
+ "StableDiffusion_v15",
653
+ "DreamShaper_8",
654
+ "AingDiffusion_v12",
655
+ "Flat2DAnimerge_v45Sharp",
656
+ "TextualInversion_VeryBadImageNegative_v1.3",
657
+ "StableDiffusionXL_v1",
658
+ "BluePencilXL_v200",
659
+ "StableDiffusionXL_Turbo",
660
+ "ControlNet_v11f1p_sd15_depth",
661
+ "ControlNet_v11p_sd15_softedge",
662
+ "ControlNet_v11f1e_sd15_tile",
663
+ "ControlNet_v11p_sd15_lineart",
664
+ "AnimateDiff_v2",
665
+ "AnimateDiff_xl_beta",
666
+ "RIFE",
667
+ "BeautifulPrompt",
668
+ "opus-mt-zh-en",
669
+ "IP-Adapter-SD",
670
+ "IP-Adapter-SDXL",
671
+ "StableDiffusion3",
672
+ "StableDiffusion3_without_T5",
673
+ "Kolors",
674
+ "SDXL-vae-fp16-fix",
675
+ "ControlNet_union_sdxl_promax",
676
+ "FLUX.1-dev",
677
+ "FLUX.1-schnell",
678
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
679
+ "jasperai/Flux.1-dev-Controlnet-Depth",
680
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
681
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
682
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
683
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
684
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
685
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
686
+ "InstantX/FLUX.1-dev-IP-Adapter",
687
+ "InfiniteYou",
688
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
689
+ "QwenPrompt",
690
+ "OmostPrompt",
691
+ "ESRGAN_x4",
692
+ "RIFE",
693
+ "OmniGen-v1",
694
+ "CogVideoX-5B",
695
+ "Annotators:Depth",
696
+ "Annotators:Softedge",
697
+ "Annotators:Lineart",
698
+ "Annotators:Normal",
699
+ "Annotators:Openpose",
700
+ "StableDiffusion3.5-large",
701
+ "StableDiffusion3.5-medium",
702
+ "HunyuanVideo",
703
+ "HunyuanVideo-fp8",
704
+ "HunyuanVideoI2V",
705
+ ]
diffsynth/data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .video import VideoData, save_video, save_frames
diffsynth/data/video.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import imageio
4
+ import imageio_ffmpeg as ffmpeg
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from matplotlib import cm
8
+ from PIL import Image
9
+ from tqdm import tqdm
10
+
11
+
12
class LowMemoryVideo:
    """Lazily decode frames from a video file, one frame at a time."""

    def __init__(self, file_name):
        # Keep only the reader handle; frames are decoded on demand.
        self.reader = imageio.get_reader(file_name)

    def __len__(self):
        return self.reader.count_frames()

    def __getitem__(self, item):
        frame = np.array(self.reader.get_data(item))
        return Image.fromarray(frame).convert("RGB")

    def __del__(self):
        # Release the underlying decoder when the wrapper is collected.
        self.reader.close()
24
+
25
+
26
def split_file_name(file_name):
    """Split a file name into a tuple of characters and embedded integers.

    Example: "img12.png" -> ("i", "m", "g", 12, ".", "p", "n", "g").
    Sorting by this key compares numeric runs by value, so "img2"
    precedes "img10".
    """
    parts = []
    pending = -1  # -1 means no digit run is in progress
    for ch in file_name:
        if "0" <= ch <= "9":
            # Extend (or start) the current digit run.
            pending = (0 if pending == -1 else pending) * 10 + int(ch)
        else:
            if pending != -1:
                parts.append(pending)
                pending = -1
            parts.append(ch)
    if pending != -1:
        # Flush a trailing digit run.
        parts.append(pending)
    return tuple(parts)
43
+
44
+
45
def search_for_images(folder):
    """Return full paths of .jpg/.png files in `folder`, numerically sorted."""
    names = [n for n in os.listdir(folder) if n.endswith((".jpg", ".png"))]
    # Sort by the (chars, ints) key first, then by the raw name to break ties,
    # mirroring a sort over (split_file_name(name), name) tuples.
    names.sort(key=lambda n: (split_file_name(n), n))
    return [os.path.join(folder, n) for n in names]
54
+
55
+
56
class LowMemoryImageFolder:
    """Lazily load images from a folder; only the paths are kept in memory."""

    def __init__(self, folder, file_list=None):
        if file_list is None:
            # Discover images in the folder, numerically sorted.
            self.file_list = search_for_images(folder)
        else:
            self.file_list = [os.path.join(folder, name) for name in file_list]

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, item):
        # Decode on access; nothing is cached.
        return Image.open(self.file_list[item]).convert("RGB")

    def __del__(self):
        pass
73
+
74
+
75
def crop_and_resize(image, height, width):
    """Center-crop `image` to the target aspect ratio, then resize.

    Equal margins are removed from the dimension that is too long, so the
    result is exactly (height, width) without distortion.
    """
    pixels = np.array(image)
    src_h, src_w, _ = pixels.shape
    if src_h / src_w < height / width:
        # Source is wider than the target aspect ratio: crop the width.
        crop_w = int(src_h / height * width)
        x0 = (src_w - crop_w) // 2
        pixels = pixels[:, x0: x0 + crop_w]
    else:
        # Source is taller than the target aspect ratio: crop the height.
        crop_h = int(src_w / width * height)
        y0 = (src_h - crop_h) // 2
        pixels = pixels[y0: y0 + crop_h, :]
    return Image.fromarray(pixels).resize((width, height))
89
+
90
+
91
class VideoData:
    """Unified frame source backed by either a video file or an image folder.

    Frames are loaded lazily and optionally center-cropped/resized to a
    fixed (height, width).
    """

    def __init__(
        self, video_file=None, image_folder=None, height=None, width=None, **kwargs
    ):
        if video_file is not None:
            self.data_type = "video"
            self.data = LowMemoryVideo(video_file, **kwargs)
        elif image_folder is not None:
            self.data_type = "images"
            self.data = LowMemoryImageFolder(image_folder, **kwargs)
        else:
            raise ValueError("Cannot open video or image folder")
        self.length = None  # optional user-imposed frame-count cap
        self.set_shape(height, width)

    def raw_data(self):
        """Materialize every frame into a list (may use a lot of memory)."""
        return [self[i] for i in range(len(self))]

    def set_length(self, length):
        self.length = length

    def set_shape(self, height, width):
        self.height = height
        self.width = width

    def __len__(self):
        return len(self.data) if self.length is None else self.length

    def shape(self):
        """Return (height, width) of the output frames."""
        if self.height is not None and self.width is not None:
            return self.height, self.width
        # Fall back to the native size of the first frame.
        # Fix: frames are PIL images, which expose .size (width, height),
        # not a numpy-style .shape attribute — the old code raised
        # AttributeError here.
        width, height = self[0].size
        return height, width

    def __getitem__(self, item):
        frame = self.data.__getitem__(item)
        width, height = frame.size
        if self.height is not None and self.width is not None:
            if self.height != height or self.width != width:
                frame = crop_and_resize(frame, self.height, self.width)
        return frame

    def __del__(self):
        pass

    def save_images(self, folder):
        """Write every frame to `folder` as 0.png, 1.png, ..."""
        os.makedirs(folder, exist_ok=True)
        for i in tqdm(range(len(self)), desc="Saving images"):
            self[i].save(os.path.join(folder, f"{i}.png"))
148
+
149
+
150
def save_video_ffmpeg(frames, save_path, fps):
    """Encode `frames` (T, H, W, C uint8) to `save_path` via imageio-ffmpeg.

    Float inputs are assumed to be in [0, 1] and are scaled to uint8.
    C must be 1, 3 or 4.
    """
    frames = np.array(frames)
    if frames.dtype != np.uint8:
        frames = (frames * 255).clip(0, 255).astype(np.uint8)
    T, H, W, C = frames.shape
    assert C in [1, 3, 4]

    writer = ffmpeg.write_frames(
        save_path,
        (W, H),
        fps=fps,
        quality=9,
        macro_block_size=None,  # avoid padding frames to a multiple of 16
    )
    # Fix: write_frames returns a generator that MUST be primed with
    # send(None) before the first frame, otherwise sending raises TypeError.
    writer.send(None)
    for frame in frames:
        writer.send(frame)

    writer.close()
169
+
170
+
171
def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None, grayscale=True):
    """Write `frames` to a video file.

    When grayscale is True, frames are written as-is (floats in [0, 1]
    are scaled to uint8, trailing singleton channels are squeezed).
    When grayscale is False, each frame is treated as a single-channel
    map (e.g. depth) and colorized with the 'Spectral_r' colormap via a
    precomputed 256-entry lookup table.

    Fix: removed a large block of dead, commented-out alternative
    implementation that trailed this function.
    """
    writer = imageio.get_writer(
        save_path, fps=fps, quality=quality, macro_block_size=1, ffmpeg_params=ffmpeg_params
    )
    if not grayscale:
        # Build the colormap LUT once; indexing it per frame is far cheaper
        # than calling the colormap on every pixel array.
        cmap = plt.get_cmap('Spectral_r')
        lut = (cmap(np.linspace(0, 1, 256))[:, :3] * 255).astype(np.uint8)

    for frame in frames:
        frame = np.array(frame)

        if not grayscale:
            # Collapse multi-channel input to its first channel.
            # NOTE(review): assumes channels are redundant (depth-like data
            # replicated across channels) — confirm for true RGB inputs.
            if frame.ndim == 3:
                if frame.shape[-1] >= 3:
                    frame = frame[..., 0]
                elif frame.shape[-1] == 1:
                    frame = frame[..., 0]

            # Floats are assumed in [0, 1]; integers are clipped to [0, 255].
            if frame.dtype in [np.float32, np.float64]:
                indices = (frame * 255).clip(0, 255).astype(np.uint8)
            else:
                indices = frame.clip(0, 255).astype(np.uint8)

            frame_out = lut[indices]

        else:
            if frame.dtype in [np.float32, np.float64]:
                frame_out = (frame * 255).clip(0, 255).astype(np.uint8)
            else:
                frame_out = frame.astype(np.uint8)

            # Squeeze a trailing singleton channel so imageio writes grayscale.
            if frame_out.ndim == 3 and frame_out.shape[-1] == 1:
                frame_out = frame_out[..., 0]

        writer.append_data(frame_out)

    writer.close()
237
+
238
+
239
def save_frames(frames, save_path):
    """Save each frame as <save_path>/<index>.png.

    Fix: the original body contained the save line three times, writing
    every frame to disk three times over.
    """
    os.makedirs(save_path, exist_ok=True)
    for i, frame in enumerate(tqdm(frames, desc="Saving images")):
        frame.save(os.path.join(save_path, f"{i}.png"))
diffsynth/distributed/__init__.py ADDED
File without changes
diffsynth/distributed/xdit_context_parallel.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Optional
3
+ from einops import rearrange
4
+ from xfuser.core.distributed import (get_sequence_parallel_rank,
5
+ get_sequence_parallel_world_size,
6
+ get_sp_group)
7
+ from xfuser.core.long_ctx_attention import xFuserLongContextAttention
8
+
9
def sinusoidal_embedding_1d(dim, position):
    """Classic sinusoidal position embedding: [cos | sin] halves, each dim/2 wide."""
    half = dim // 2
    # Frequencies 10000^(-i/half), computed in float64 for precision.
    inv_freq = torch.pow(
        10000,
        -torch.arange(half, dtype=torch.float64, device=position.device).div(half),
    )
    angles = torch.outer(position.type(torch.float64), inv_freq)
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
    return embedding.to(position.dtype)
14
+
15
def pad_freqs(original_tensor, target_len):
    """Pad a RoPE frequency tensor along dim 0 with ones up to target_len.

    Ones act as the identity for the complex rotation, so padded positions
    leave activations unchanged.
    """
    seq_len, s1, s2 = original_tensor.shape
    padding = torch.ones(
        target_len - seq_len,
        s1,
        s2,
        dtype=original_tensor.dtype,
        device=original_tensor.device,
    )
    return torch.cat([original_tensor, padding], dim=0)
26
+
27
def rope_apply(x, freqs, num_heads):
    """Apply rotary position embedding to this rank's sequence shard.

    x: (b, s_local, n*d) activations for the local sequence-parallel shard.
    freqs: complex rotation factors; sliced to this rank's positions.
    Returns a tensor with the same shape and dtype as x.
    """
    x = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
    s_per_rank = x.shape[1]

    # View consecutive channel pairs as complex numbers (float64 for precision).
    x_out = torch.view_as_complex(x.to(torch.float64).reshape(
        x.shape[0], x.shape[1], x.shape[2], -1, 2))

    # Pad the table with ones (identity rotation) so every rank can slice a
    # full-size table, then take this rank's contiguous chunk of positions.
    sp_size = get_sequence_parallel_world_size()
    sp_rank = get_sequence_parallel_rank()
    freqs = pad_freqs(freqs, s_per_rank * sp_size)
    freqs_rank = freqs[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]

    x_out = torch.view_as_real(x_out * freqs_rank).flatten(2)
    return x_out.to(x.dtype)
41
+
42
def usp_dit_forward(self,
                    x: torch.Tensor,
                    timestep: torch.Tensor,
                    context: torch.Tensor,
                    clip_feature: Optional[torch.Tensor] = None,
                    y: Optional[torch.Tensor] = None,
                    use_gradient_checkpointing: bool = False,
                    use_gradient_checkpointing_offload: bool = False,
                    **kwargs,
                    ):
    """Sequence-parallel (xDiT/USP) forward for the Wan video DiT.

    Splits the patchified token sequence across sequence-parallel ranks,
    runs the transformer blocks on the local shard only, then all-gathers
    before unpatchifying.
    """
    # Timestep embedding and the 6 per-block modulation vectors.
    t = self.time_embedding(
        sinusoidal_embedding_1d(self.freq_dim, timestep))
    t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
    context = self.text_embedding(context)

    if self.has_image_input:
        # Image conditioning: concatenate latents along channels and prepend
        # CLIP image tokens to the text context.
        x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
        clip_embdding = self.img_emb(clip_feature)
        context = torch.cat([clip_embdding, context], dim=1)

    x, (f, h, w) = self.patchify(x)

    # 3D RoPE table: per-axis frequencies broadcast over (f, h, w), flattened
    # to one entry per token.
    freqs = torch.cat([
        self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
        self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
        self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
    ], dim=-1).reshape(f * h * w, 1, -1).to(x.device)

    def create_custom_forward(module):
        # Wrapper so torch.utils.checkpoint can re-invoke the block.
        def custom_forward(*inputs):
            return module(*inputs)
        return custom_forward

    # Context Parallel: keep only this rank's chunk of the token sequence.
    x = torch.chunk(
        x, get_sequence_parallel_world_size(),
        dim=1)[get_sequence_parallel_rank()]

    for block in self.blocks:
        if self.training and use_gradient_checkpointing:
            if use_gradient_checkpointing_offload:
                # Additionally park checkpointed activations on the CPU.
                with torch.autograd.graph.save_on_cpu():
                    x = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        x, context, t_mod, freqs,
                        use_reentrant=False,
                    )
            else:
                x = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    x, context, t_mod, freqs,
                    use_reentrant=False,
                )
        else:
            x = block(x, context, t_mod, freqs)

    x = self.head(x, t)

    # Context Parallel: reassemble the full sequence across ranks.
    x = get_sp_group().all_gather(x, dim=1)

    # unpatchify
    x = self.unpatchify(x, (f, h, w))
    return x
106
+
107
+
108
def usp_attn_forward(self, x, freqs):
    """Sequence-parallel self-attention via xFuser long-context attention.

    q/k receive rotary embeddings for this rank's positions; the
    distributed attention kernel handles the cross-rank exchange.
    """
    q = self.norm_q(self.q(x))
    k = self.norm_k(self.k(x))
    v = self.v(x)

    q = rope_apply(q, freqs, self.num_heads)
    k = rope_apply(k, freqs, self.num_heads)
    # Split the fused head dimension into (heads, head_dim) for the kernel.
    q = rearrange(q, "b s (n d) -> b s n d", n=self.num_heads)
    k = rearrange(k, "b s (n d) -> b s n d", n=self.num_heads)
    v = rearrange(v, "b s (n d) -> b s n d", n=self.num_heads)

    x = xFuserLongContextAttention()(
        None,
        query=q,
        key=k,
        value=v,
    )
    x = x.flatten(2)

    # Drop the large projections before the output matmul to cap peak memory.
    del q, k, v
    torch.cuda.empty_cache()
    return self.o(x)
diffsynth/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model_manager import *
diffsynth/models/downloader.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from typing import List
4
+
5
+ from huggingface_hub import hf_hub_download
6
+ from modelscope import snapshot_download
7
+ from typing_extensions import Literal, TypeAlias
8
+
9
+ from ..configs.model_config import (Preset_model_id,
10
+ preset_models_on_huggingface,
11
+ preset_models_on_modelscope)
12
+
13
+
14
def download_from_modelscope(model_id, origin_file_path, local_dir):
    """Download one file of a ModelScope repo into local_dir, flattened.

    Skips the download when a file of the same name already exists in
    local_dir.  snapshot_download recreates the repo's subdirectory layout,
    so the file is moved up into local_dir and the nested tree removed.
    """
    os.makedirs(local_dir, exist_ok=True)
    file_name = os.path.basename(origin_file_path)
    if file_name in os.listdir(local_dir):
        print(f" {file_name} has been already in {local_dir}.")
    else:
        print(f" Start downloading {os.path.join(local_dir, file_name)}")
        snapshot_download(model_id, allow_file_pattern=origin_file_path, local_dir=local_dir)
        downloaded_file_path = os.path.join(local_dir, origin_file_path)
        target_file_path = os.path.join(local_dir, os.path.split(origin_file_path)[-1])
        if downloaded_file_path != target_file_path:
            # Flatten: move the file up and drop the now-empty repo subtree.
            shutil.move(downloaded_file_path, target_file_path)
            shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
27
+
28
+
29
def download_from_huggingface(model_id, origin_file_path, local_dir):
    """Download one file of a HuggingFace repo into local_dir, flattened.

    Mirrors download_from_modelscope: skips existing files, then moves the
    downloaded file out of the recreated repo subdirectory.
    """
    os.makedirs(local_dir, exist_ok=True)
    file_name = os.path.basename(origin_file_path)
    if file_name in os.listdir(local_dir):
        print(f" {file_name} has been already in {local_dir}.")
    else:
        print(f" Start downloading {os.path.join(local_dir, file_name)}")
        hf_hub_download(model_id, origin_file_path, local_dir=local_dir)
        downloaded_file_path = os.path.join(local_dir, origin_file_path)
        target_file_path = os.path.join(local_dir, file_name)
        if downloaded_file_path != target_file_path:
            # Flatten: move the file up and drop the now-empty repo subtree.
            shutil.move(downloaded_file_path, target_file_path)
            shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
42
+
43
+
44
# Supported download sources.
Preset_model_website: TypeAlias = Literal[
    "HuggingFace",
    "ModelScope",
]
# Maps each website to its table of preset model files.
website_to_preset_models = {
    "HuggingFace": preset_models_on_huggingface,
    "ModelScope": preset_models_on_modelscope,
}
# Maps each website to the function that downloads a single file from it.
website_to_download_fn = {
    "HuggingFace": download_from_huggingface,
    "ModelScope": download_from_modelscope,
}
56
+
57
+
58
def download_customized_models(
    model_id,
    origin_file_path,
    local_dir,
    downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
):
    """Download one user-specified file, trying websites in priority order.

    Returns the list of local file paths that ended up on disk.
    NOTE(review): the default priority list is a mutable default argument;
    it is never mutated here, but confirm callers do not modify it.
    """
    downloaded_files = []
    for website in downloading_priority:
        # Check if the file is downloaded.
        file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
        if file_to_download in downloaded_files:
            continue
        # Download
        website_to_download_fn[website](model_id, origin_file_path, local_dir)
        if os.path.basename(origin_file_path) in os.listdir(local_dir):
            downloaded_files.append(file_to_download)
    return downloaded_files
75
+
76
+
77
def download_models(
    model_id_list: List[Preset_model_id] = [],
    downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
):
    """Download every preset model in model_id_list.

    For each model the websites are tried in priority order; the first
    website that yields at least one new file wins.  Returns the list of
    local files (or the preset's explicit "load_path" entries) to load.
    """
    print(f"Downloading models: {model_id_list}")
    downloaded_files = []
    load_files = []

    for model_id in model_id_list:
        for website in downloading_priority:
            if model_id in website_to_preset_models[website]:

                # Parse model metadata: either a bare file list or a dict
                # with "file_list" (and optionally "load_path").
                model_metadata = website_to_preset_models[website][model_id]
                if isinstance(model_metadata, list):
                    file_data = model_metadata
                else:
                    file_data = model_metadata.get("file_list", [])

                # Try downloading the model from this website.
                model_files = []
                # Fix: the loop variable was previously named `model_id`,
                # shadowing the preset id from the outer loop.
                for repo_id, origin_file_path, local_dir in file_data:
                    # Check if the file is downloaded.
                    file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
                    if file_to_download in downloaded_files:
                        continue
                    # Download
                    website_to_download_fn[website](repo_id, origin_file_path, local_dir)
                    if os.path.basename(origin_file_path) in os.listdir(local_dir):
                        downloaded_files.append(file_to_download)
                        model_files.append(file_to_download)

                # If the model is successfully downloaded, stop trying other sites.
                if len(model_files) > 0:
                    if isinstance(model_metadata, dict) and "load_path" in model_metadata:
                        model_files = model_metadata["load_path"]
                    load_files.extend(model_files)
                    break

    return load_files
diffsynth/models/model_manager.py ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib
2
+ import json
3
+ import os
4
+ from typing import List
5
+
6
+ import torch
7
+
8
+ from ..configs.model_config import (huggingface_model_loader_configs,
9
+ model_loader_configs,
10
+ patch_model_loader_configs)
11
+ from .downloader import (Preset_model_id, Preset_model_website,
12
+ download_customized_models, download_models)
13
+ from .utils import (hash_state_dict_keys, init_weights_on_device,
14
+ load_state_dict, split_state_dict_with_prefix)
15
+
16
+
17
def load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device):
    """Instantiate each model class from a single-file state dict.

    model_resource selects the converter ("civitai" or "diffusers").
    Returns (loaded_model_names, loaded_models).
    """
    loaded_model_names, loaded_models = [], []
    for model_name, model_class in zip(model_names, model_classes):
        print(f" model_name: {model_name} model_class: {model_class.__name__}")
        state_dict_converter = model_class.state_dict_converter()
        if model_resource == "civitai":
            state_dict_results = state_dict_converter.from_civitai(state_dict)
        elif model_resource == "diffusers":
            state_dict_results = state_dict_converter.from_diffusers(state_dict)
        else:
            # Fix: an unknown resource previously left state_dict_results
            # unbound and crashed later with a confusing NameError.
            raise ValueError(f"Unsupported model resource: {model_resource}")
        # The converter may return (state_dict, extra_kwargs) or just a state dict.
        if isinstance(state_dict_results, tuple):
            model_state_dict, extra_kwargs = state_dict_results
            print(f" This model is initialized with extra kwargs: {extra_kwargs}")
        else:
            model_state_dict, extra_kwargs = state_dict_results, {}
        # Fix: compute the dtype per model instead of overwriting torch_dtype,
        # which leaked float32 into every subsequent model once one model
        # requested an upcast.
        model_dtype = torch.float32 if extra_kwargs.get("upcast_to_float32", False) else torch_dtype
        # Build the skeleton on the meta device, then materialize the weights.
        with init_weights_on_device():
            model = model_class(**extra_kwargs)
        if hasattr(model, "eval"):
            model = model.eval()
        model.load_state_dict(model_state_dict, assign=True)
        model = model.to(dtype=model_dtype, device=device)
        loaded_model_names.append(model_name)
        loaded_models.append(model)
    return loaded_model_names, loaded_models
41
+
42
+
43
def load_model_from_huggingface_folder(file_path, model_names, model_classes, torch_dtype, device):
    """Load each model class from a HuggingFace-style folder via from_pretrained.

    Standard float dtypes are passed straight to from_pretrained; other
    dtypes (e.g. fp8) are applied after loading.
    """
    loaded_model_names, loaded_models = [], []
    for model_name, model_class in zip(model_names, model_classes):
        if torch_dtype in [torch.float32, torch.float16, torch.bfloat16]:
            model = model_class.from_pretrained(file_path, torch_dtype=torch_dtype).eval()
        else:
            model = model_class.from_pretrained(file_path).eval().to(dtype=torch_dtype)
        if torch_dtype == torch.float16 and hasattr(model, "half"):
            model = model.half()
        try:
            model = model.to(device=device)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.  Best effort: some models
            # cannot be moved to the target device.
            pass
        loaded_model_names.append(model_name)
        loaded_models.append(model)
    return loaded_model_names, loaded_models
59
+
60
+
61
def load_single_patch_model_from_single_file(state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device):
    """Build a patched variant of base_model.

    Starts from the base model's weights, then overlays the patch
    checkpoint's weights on top; the returned model replaces the base.
    """
    print(f" model_name: {model_name} model_class: {model_class.__name__} extra_kwargs: {extra_kwargs}")
    base_state_dict = base_model.state_dict()
    # Move the base model off the accelerator before instantiating the patch
    # model, to avoid holding two full copies in device memory.
    base_model.to("cpu")
    del base_model
    model = model_class(**extra_kwargs)
    # strict=False: the patched architecture may add or remove parameters.
    model.load_state_dict(base_state_dict, strict=False)
    model.load_state_dict(state_dict, strict=False)
    model.to(dtype=torch_dtype, device=device)
    return model
71
+
72
+
73
def load_patch_model_from_single_file(state_dict, model_names, model_classes, extra_kwargs, model_manager, torch_dtype, device):
    """Patch every model held by model_manager whose name matches.

    Each matching base model is removed from the manager and a freshly
    patched replacement is returned.  The while/for-else loop rescans after
    every removal, because popping invalidates the indices being iterated.
    """
    loaded_model_names, loaded_models = [], []
    for model_name, model_class in zip(model_names, model_classes):
        while True:
            for model_id in range(len(model_manager.model)):
                base_model_name = model_manager.model_name[model_id]
                if base_model_name == model_name:
                    base_model_path = model_manager.model_path[model_id]
                    base_model = model_manager.model[model_id]
                    print(f" Adding patch model to {base_model_name} ({base_model_path})")
                    patched_model = load_single_patch_model_from_single_file(
                        state_dict, model_name, model_class, base_model, extra_kwargs, torch_dtype, device)
                    loaded_model_names.append(base_model_name)
                    loaded_models.append(patched_model)
                    # Drop the stale base-model entry; restart the scan.
                    model_manager.model.pop(model_id)
                    model_manager.model_path.pop(model_id)
                    model_manager.model_name.pop(model_id)
                    break
            else:
                # for-else: no match found in a full scan -> done with this name.
                break
    return loaded_model_names, loaded_models
94
+
95
+
96
+
97
class ModelDetectorTemplate:
    """Interface for model detectors: match() a checkpoint, then load() it."""

    def __init__(self):
        pass

    def match(self, file_path="", state_dict={}):
        """The base detector never matches anything."""
        return False

    def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
        """The base detector loads nothing: ([], [])."""
        return [], []
106
+
107
+
108
+
109
class ModelDetectorFromSingleFile:
    """Detect and load models stored as one checkpoint file, identified by a
    hash of the state-dict keys (optionally including parameter shapes)."""

    def __init__(self, model_loader_configs=[]):
        self.keys_hash_with_shape_dict = {}
        self.keys_hash_dict = {}
        for metadata in model_loader_configs:
            self.add_model_metadata(*metadata)

    def add_model_metadata(self, keys_hash, keys_hash_with_shape, model_names, model_classes, model_resource):
        """Register a model under its key hashes (shape-aware hash is mandatory)."""
        self.keys_hash_with_shape_dict[keys_hash_with_shape] = (model_names, model_classes, model_resource)
        if keys_hash is not None:
            self.keys_hash_dict[keys_hash] = (model_names, model_classes, model_resource)

    def match(self, file_path="", state_dict={}):
        """Return True if the checkpoint's key hash matches a registered model."""
        if isinstance(file_path, str) and os.path.isdir(file_path):
            return False
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)
        keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
        if keys_hash_with_shape in self.keys_hash_with_shape_dict:
            return True
        keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
        if keys_hash in self.keys_hash_dict:
            return True
        return False

    def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
        """Load the model(s) matching the checkpoint.

        Returns ([], []) when nothing matches.
        """
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)

        # Strict matching: keys AND parameter shapes must agree.
        keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
        if keys_hash_with_shape in self.keys_hash_with_shape_dict:
            model_names, model_classes, model_resource = self.keys_hash_with_shape_dict[keys_hash_with_shape]
            return load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device)

        # Loose matching: keys only (shapes may differ; the state_dict_converter
        # will adapt the model architecture).
        keys_hash = hash_state_dict_keys(state_dict, with_shape=False)
        if keys_hash in self.keys_hash_dict:
            model_names, model_classes, model_resource = self.keys_hash_dict[keys_hash]
            return load_model_from_single_file(state_dict, model_names, model_classes, model_resource, torch_dtype, device)

        # Fix: the original fell through to `return loaded_model_names,
        # loaded_models` with both names unbound, raising NameError whenever
        # no hash matched.
        return [], []
157
+
158
+
159
+
160
class ModelDetectorFromSplitedSingleFile(ModelDetectorFromSingleFile):
    """Detect models inside one file that bundles several state dicts,
    distinguished by their key prefixes."""

    def __init__(self, model_loader_configs=[]):
        super().__init__(model_loader_configs)

    def match(self, file_path="", state_dict={}):
        """Match when any prefix-split component matches a registered model."""
        if isinstance(file_path, str) and os.path.isdir(file_path):
            return False
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)
        splited_state_dict = split_state_dict_with_prefix(state_dict)
        for sub_state_dict in splited_state_dict:
            if super().match(file_path, sub_state_dict):
                return True
        return False

    def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
        # Fix: load the state dict from the file when it was not provided,
        # matching match() and the sibling detectors.
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)
        # Split the state_dict and load from each component.
        splited_state_dict = split_state_dict_with_prefix(state_dict)
        valid_state_dict = {}
        for sub_state_dict in splited_state_dict:
            if super().match(file_path, sub_state_dict):
                valid_state_dict.update(sub_state_dict)
        if super().match(file_path, valid_state_dict):
            # All matching components merge into one recognizable state dict.
            loaded_model_names, loaded_models = super().load(file_path, valid_state_dict, device, torch_dtype)
        else:
            # Otherwise load each matching component separately.
            loaded_model_names, loaded_models = [], []
            for sub_state_dict in splited_state_dict:
                if super().match(file_path, sub_state_dict):
                    # Fix: the original passed valid_state_dict here, loading
                    # the same merged (unmatched) dict once per component
                    # instead of the component itself.
                    loaded_model_names_, loaded_models_ = super().load(file_path, sub_state_dict, device, torch_dtype)
                    loaded_model_names += loaded_model_names_
                    loaded_models += loaded_models_
        return loaded_model_names, loaded_models
194
+
195
+
196
+
197
class ModelDetectorFromHuggingfaceFolder:
    """Detect Diffusers/Transformers-style model folders via their config.json."""

    def __init__(self, model_loader_configs=[]):
        self.architecture_dict = {}
        for metadata in model_loader_configs:
            self.add_model_metadata(*metadata)

    def add_model_metadata(self, architecture, huggingface_lib, model_name, redirected_architecture):
        """Register which library/class handles a given architecture string."""
        self.architecture_dict[architecture] = (huggingface_lib, model_name, redirected_architecture)

    def match(self, file_path="", state_dict={}):
        """A folder matches when its config.json declares an architecture."""
        if not isinstance(file_path, str) or os.path.isfile(file_path):
            return False
        if "config.json" not in os.listdir(file_path):
            return False
        with open(os.path.join(file_path, "config.json"), "r") as f:
            config = json.load(f)
        return "architectures" in config or "_class_name" in config

    def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, **kwargs):
        """Instantiate every architecture declared by the folder's config.json."""
        with open(os.path.join(file_path, "config.json"), "r") as f:
            config = json.load(f)
        loaded_model_names, loaded_models = [], []
        if "architectures" in config:
            architectures = config["architectures"]
        else:
            architectures = [config["_class_name"]]
        for architecture in architectures:
            huggingface_lib, model_name, redirected_architecture = self.architecture_dict[architecture]
            if redirected_architecture is not None:
                # Some architectures are aliased to a different class name.
                architecture = redirected_architecture
            model_class = importlib.import_module(huggingface_lib).__getattribute__(architecture)
            loaded_model_names_, loaded_models_ = load_model_from_huggingface_folder(file_path, [model_name], [model_class], torch_dtype, device)
            loaded_model_names += loaded_model_names_
            loaded_models += loaded_models_
        return loaded_model_names, loaded_models
235
+
236
+
237
+
238
class ModelDetectorFromPatchedSingleFile:
    """Detect checkpoints that patch an already-loaded base model,
    identified by a shape-aware hash of the state-dict keys."""

    def __init__(self, model_loader_configs=[]):
        self.keys_hash_with_shape_dict = {}
        for metadata in model_loader_configs:
            self.add_model_metadata(*metadata)

    def add_model_metadata(self, keys_hash_with_shape, model_name, model_class, extra_kwargs):
        # Register the patch model under its shape-aware key hash.
        self.keys_hash_with_shape_dict[keys_hash_with_shape] = (model_name, model_class, extra_kwargs)

    def match(self, file_path="", state_dict={}):
        """Return True if the file's shape-aware key hash names a patch model."""
        # Only single files (not folders) can be patch checkpoints.
        if not isinstance(file_path, str) or os.path.isdir(file_path):
            return False
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)
        keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
        if keys_hash_with_shape in self.keys_hash_with_shape_dict:
            return True
        return False

    def load(self, file_path="", state_dict={}, device="cuda", torch_dtype=torch.float16, model_manager=None, **kwargs):
        """Patch the matching base model(s) currently held by model_manager."""
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)

        # Load models with strict matching
        loaded_model_names, loaded_models = [], []
        keys_hash_with_shape = hash_state_dict_keys(state_dict, with_shape=True)
        if keys_hash_with_shape in self.keys_hash_with_shape_dict:
            model_names, model_classes, extra_kwargs = self.keys_hash_with_shape_dict[keys_hash_with_shape]
            loaded_model_names_, loaded_models_ = load_patch_model_from_single_file(
                state_dict, model_names, model_classes, extra_kwargs, model_manager, torch_dtype, device)
            loaded_model_names += loaded_model_names_
            loaded_models += loaded_models_
        return loaded_model_names, loaded_models
274
+
275
+
276
+
277
class ModelManager:
    """Registry that downloads, detects, and loads model checkpoints.

    Loaded models are stored in three parallel lists (``model``,
    ``model_path``, ``model_name``) so that :meth:`fetch_model` can look a
    model up by name and, optionally, by the file it came from.
    """

    def __init__(
        self,
        torch_dtype=torch.float16,
        device="cuda",
        model_id_list: List[Preset_model_id] = None,
        downloading_priority: List[Preset_model_website] = None,
        file_path_list: List[str] = None,
    ):
        """Create the manager and eagerly load any requested models.

        Args:
            torch_dtype: dtype models are cast to when loaded.
            device: device models are placed on when loaded.
            model_id_list: preset model ids to download before loading.
            downloading_priority: website order for downloads; defaults to
                ``["HuggingFace", "ModelScope"]``.
            file_path_list: extra local checkpoint paths to load.

        Note: list defaults are ``None`` rather than mutable literals to
        avoid the shared-mutable-default pitfall.
        """
        self.torch_dtype = torch_dtype
        self.device = device
        self.model = []
        self.model_path = []
        self.model_name = []
        model_id_list = [] if model_id_list is None else model_id_list
        if downloading_priority is None:
            downloading_priority = ["HuggingFace", "ModelScope"]
        file_path_list = [] if file_path_list is None else file_path_list
        downloaded_files = download_models(model_id_list, downloading_priority) if len(model_id_list) > 0 else []
        # Detectors are tried in order by load_model(); first match wins.
        self.model_detector = [
            ModelDetectorFromSingleFile(model_loader_configs),
            ModelDetectorFromSplitedSingleFile(model_loader_configs),
            ModelDetectorFromHuggingfaceFolder(huggingface_model_loader_configs),
            ModelDetectorFromPatchedSingleFile(patch_model_loader_configs),
        ]
        self.load_models(downloaded_files + file_path_list)


    def _register_models(self, model_names, models, file_path):
        # Append each loaded model to the parallel registry lists.
        for model_name, model in zip(model_names, models):
            self.model.append(model)
            self.model_path.append(file_path)
            self.model_name.append(model_name)


    def load_model_from_single_file(self, file_path="", state_dict=None, model_names=None, model_classes=None, model_resource=None):
        """Load model(s) from a single checkpoint file (or a pre-read state dict)."""
        if not state_dict:
            state_dict = load_state_dict(file_path)
        # Calls the module-level loader of the same name.
        model_names, models = load_model_from_single_file(state_dict, model_names, model_classes, model_resource, self.torch_dtype, self.device)
        self._register_models(model_names, models, file_path)
        print(f" The following models are loaded: {model_names}.")


    def load_model_from_huggingface_folder(self, file_path="", model_names=None, model_classes=None):
        """Load model(s) from a Huggingface-format folder."""
        model_names, models = load_model_from_huggingface_folder(file_path, model_names, model_classes, self.torch_dtype, self.device)
        self._register_models(model_names, models, file_path)
        print(f" The following models are loaded: {model_names}.")


    def load_patch_model_from_single_file(self, file_path="", state_dict=None, model_names=None, model_classes=None, extra_kwargs=None):
        """Load "patch" model(s) that wrap an already-loaded base model."""
        print(f"Loading patch models from file: {file_path}")
        model_names, models = load_patch_model_from_single_file(
            {} if state_dict is None else state_dict,
            model_names, model_classes,
            {} if extra_kwargs is None else extra_kwargs,
            self, self.torch_dtype, self.device)
        self._register_models(model_names, models, file_path)
        print(f" The following patched models are loaded: {model_names}.")


    def load_lora(self, file_path="", state_dict=None, lora_alpha=1.0):
        """Merge LoRA weights into whichever loaded model a LoRA loader matches.

        ``file_path`` may be a single path or a list of paths.
        """
        if state_dict is None:
            state_dict = {}
        if isinstance(file_path, list):
            for file_path_ in file_path:
                self.load_lora(file_path_, state_dict=state_dict, lora_alpha=lora_alpha)
            return
        print(f"Loading LoRA models from file: {file_path}")
        is_loaded = False
        if len(state_dict) == 0:
            state_dict = load_state_dict(file_path)
        for model_name, model, model_path in zip(self.model_name, self.model, self.model_path):
            for lora in get_lora_loaders():
                match_results = lora.match(model, state_dict)
                if match_results is not None:
                    print(f" Adding LoRA to {model_name} ({model_path}).")
                    lora_prefix, model_resource = match_results
                    lora.load(model, state_dict, lora_prefix, alpha=lora_alpha, model_resource=model_resource)
                    is_loaded = True
                    break
        if not is_loaded:
            print(f" Cannot load LoRA: {file_path}")


    def load_model(self, file_path, model_names=None, device=None, torch_dtype=None):
        """Detect the model type of ``file_path`` (file, file list, or folder)
        and load it via the first matching detector."""
        if device is None: device = self.device
        if torch_dtype is None: torch_dtype = self.torch_dtype
        if isinstance(file_path, list):
            # A split checkpoint: merge all shards into one state dict.
            state_dict = {}
            for path in file_path:
                state_dict.update(load_state_dict(path))
        elif os.path.isfile(file_path):
            state_dict = load_state_dict(file_path)
        else:
            # A folder: detectors that need a state dict will not match.
            state_dict = None
        for model_detector in self.model_detector:
            if model_detector.match(file_path, state_dict):
                model_names, models = model_detector.load(
                    file_path, state_dict,
                    device=device, torch_dtype=torch_dtype,
                    allowed_model_names=model_names, model_manager=self
                )
                self._register_models(model_names, models, file_path)
                print(f" The following models are loaded: {model_names}.")
                break
        else:
            print(f" We cannot detect the model type. No models are loaded.")


    def load_models(self, file_path_list, model_names=None, device=None, torch_dtype=None):
        """Load every path in ``file_path_list`` via :meth:`load_model`."""
        for file_path in file_path_list:
            self.load_model(file_path, model_names, device=device, torch_dtype=torch_dtype)


    def fetch_model(self, model_name, file_path=None, require_model_path=False):
        """Return the first loaded model named ``model_name``.

        Args:
            model_name: registry name to search for.
            file_path: if given, only models loaded from this path match.
            require_model_path: if True, return ``(model, path)`` instead.

        Returns:
            The model (or ``(model, path)``), or ``None`` if nothing matches.
        """
        fetched_models = []
        fetched_model_paths = []
        for model, model_path, model_name_ in zip(self.model, self.model_path, self.model_name):
            if file_path is not None and file_path != model_path:
                continue
            if model_name == model_name_:
                fetched_models.append(model)
                fetched_model_paths.append(model_path)
        if len(fetched_models) == 0:
            print(f"No {model_name} models available.")
            return None
        if len(fetched_models) > 1:
            print(f"More than one {model_name} models are loaded in model manager: {fetched_model_paths}. Using {model_name} from {fetched_model_paths[0]}.")
        if require_model_path:
            return fetched_models[0], fetched_model_paths[0]
        else:
            return fetched_models[0]


    def to(self, device):
        """Move every loaded model to ``device``."""
        for model in self.model:
            model.to(device)
416
+
diffsynth/models/tiler.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import rearrange, repeat
3
+
4
+
5
class TileWorker:
    """Applies a 2D model to a large image tile-by-tile.

    The input is unfolded into overlapping square tiles, ``forward_fn`` is run
    on batches of tiles, and the outputs are folded back with a feathered
    (border-faded) mask so overlapping regions blend smoothly.
    """

    def __init__(self):
        pass


    def mask(self, height, width, border_width):
        # Create a mask with shape (height, width).
        # The centre area is filled with 1, and the border line is filled with values in range (0, 1].
        x = torch.arange(height).repeat(width, 1).T
        y = torch.arange(width).repeat(height, 1)
        # Distance (in pixels, 1-based) to the nearest edge, then rescaled so
        # the outermost border_width pixels ramp linearly from ~0 to 1.
        mask = torch.stack([x + 1, height - x, y + 1, width - y]).min(dim=0).values
        mask = (mask / border_width).clip(0, 1)
        return mask


    def tile(self, model_input, tile_size, tile_stride, tile_device, tile_dtype):
        # Convert a tensor (b, c, h, w) to (b, c, tile_size, tile_size, tile_num)
        batch_size, channel, _, _ = model_input.shape
        model_input = model_input.to(device=tile_device, dtype=tile_dtype)
        unfold_operator = torch.nn.Unfold(
            kernel_size=(tile_size, tile_size),
            stride=(tile_stride, tile_stride)
        )
        # Unfold flattens each tile into a column; view restores the square shape.
        model_input = unfold_operator(model_input)
        model_input = model_input.view((batch_size, channel, tile_size, tile_size, -1))

        return model_input


    def tiled_inference(self, forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype):
        # Call y=forward_fn(x) for each tile
        tile_num = model_input.shape[-1]
        model_output_stack = []

        for tile_id in range(0, tile_num, tile_batch_size):

            # process input: take up to tile_batch_size tiles and merge the
            # tile axis into the batch axis before running the model.
            tile_id_ = min(tile_id + tile_batch_size, tile_num)
            x = model_input[:, :, :, :, tile_id: tile_id_]
            x = x.to(device=inference_device, dtype=inference_dtype)
            x = rearrange(x, "b c h w n -> (n b) c h w")

            # process output: split the tile axis back out and park tiles on
            # tile_device (typically CPU) to bound peak memory.
            y = forward_fn(x)
            y = rearrange(y, "(n b) c h w -> b c h w n", n=tile_id_-tile_id)
            y = y.to(device=tile_device, dtype=tile_dtype)
            model_output_stack.append(y)

        model_output = torch.concat(model_output_stack, dim=-1)
        return model_output


    def io_scale(self, model_output, tile_size):
        # Determine the size modification happened in forward_fn
        # We only consider the same scale on height and width.
        io_scale = model_output.shape[2] / tile_size
        return io_scale


    def untile(self, model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype):
        # The reversed function of tile: weight each tile by the feathered
        # mask, fold the tiles back, and normalise by the folded mask so
        # overlapping contributions average to 1.
        mask = self.mask(tile_size, tile_size, border_width)
        mask = mask.to(device=tile_device, dtype=tile_dtype)
        mask = rearrange(mask, "h w -> 1 1 h w 1")
        model_output = model_output * mask

        fold_operator = torch.nn.Fold(
            output_size=(height, width),
            kernel_size=(tile_size, tile_size),
            stride=(tile_stride, tile_stride)
        )
        mask = repeat(mask[0, 0, :, :, 0], "h w -> 1 (h w) n", n=model_output.shape[-1])
        model_output = rearrange(model_output, "b c h w n -> b (c h w) n")
        model_output = fold_operator(model_output) / fold_operator(mask)

        return model_output


    def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_batch_size=1, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
        """Run forward_fn over model_input tile-by-tile and blend the results.

        forward_fn may change the spatial resolution uniformly (e.g. a VAE);
        the scale is inferred from the output and applied to the fold geometry.
        """
        # Prepare
        inference_device, inference_dtype = model_input.device, model_input.dtype
        height, width = model_input.shape[2], model_input.shape[3]
        border_width = int(tile_stride*0.5) if border_width is None else border_width

        # tile
        model_input = self.tile(model_input, tile_size, tile_stride, tile_device, tile_dtype)

        # inference
        model_output = self.tiled_inference(forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype)

        # resize: rescale the fold geometry by the model's io scale.
        io_scale = self.io_scale(model_output, tile_size)
        height, width = int(height*io_scale), int(width*io_scale)
        tile_size, tile_stride = int(tile_size*io_scale), int(tile_stride*io_scale)
        border_width = int(border_width*io_scale)

        # untile
        model_output = self.untile(model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype)

        # Done!
        model_output = model_output.to(device=inference_device, dtype=inference_dtype)
        return model_output
107
+
108
+
109
+
110
class FastTileWorker:
    """Tile-by-tile 2D processing where forward_fn receives tile coordinates.

    Unlike TileWorker, the input is never materialised per-tile here:
    ``forward_fn(hl, hr, wl, wr)`` is expected to produce the output for that
    crop itself, and the results are accumulated with feathered masks.
    """

    def __init__(self):
        pass


    def build_mask(self, data, is_bound):
        """Feathered blend mask for one tile.

        ``is_bound`` marks (top, bottom, left, right) edges that touch the
        image boundary; those sides get full weight instead of a fade.
        """
        _, _, H, W = data.shape
        h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
        w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
        border_width = (H + W) // 4
        pad = torch.ones_like(h) * border_width
        # Per-pixel distance to the nearest non-boundary edge, clipped so the
        # mask ramps from 1/border_width up to 1 inside the tile.
        mask = torch.stack([
            pad if is_bound[0] else h + 1,
            pad if is_bound[1] else H - h,
            pad if is_bound[2] else w + 1,
            pad if is_bound[3] else W - w
        ]).min(dim=0).values
        mask = mask.clip(1, border_width)
        mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
        mask = rearrange(mask, "H W -> 1 H W")
        return mask


    def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
        """Accumulate forward_fn outputs over a grid of overlapping tiles.

        NOTE(review): assumes forward_fn's output matches the input crop's
        spatial size — no io-scale handling here, unlike TileWorker.
        """
        # Prepare
        B, C, H, W = model_input.shape
        border_width = int(tile_stride*0.5) if border_width is None else border_width
        weight = torch.zeros((1, 1, H, W), dtype=tile_dtype, device=tile_device)
        values = torch.zeros((B, C, H, W), dtype=tile_dtype, device=tile_device)

        # Split tasks
        tasks = []
        for h in range(0, H, tile_stride):
            for w in range(0, W, tile_stride):
                # Skip tiles fully covered by the previous (clamped) tile.
                if (h-tile_stride >= 0 and h-tile_stride+tile_size >= H) or (w-tile_stride >= 0 and w-tile_stride+tile_size >= W):
                    continue
                h_, w_ = h + tile_size, w + tile_size
                # Clamp the last tile in each direction to the image border.
                if h_ > H: h, h_ = H - tile_size, H
                if w_ > W: w, w_ = W - tile_size, W
                tasks.append((h, h_, w, w_))

        # Run
        for hl, hr, wl, wr in tasks:
            # Forward
            hidden_states_batch = forward_fn(hl, hr, wl, wr).to(dtype=tile_dtype, device=tile_device)

            mask = self.build_mask(hidden_states_batch, is_bound=(hl==0, hr>=H, wl==0, wr>=W))
            values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
            weight[:, :, hl:hr, wl:wr] += mask
        # Normalise by the accumulated mask weight to blend overlaps.
        values /= weight
        return values
161
+
162
+
163
+
164
class TileWorker2Dto3D:
    """
    Process 3D tensors, but only enable TileWorker on 2D.
    """
    def __init__(self):
        pass


    def build_mask(self, T, H, W, dtype, device, is_bound, border_width):
        """Feathered 3D blend mask; ``is_bound`` marks (t0, t1, h0, h1, w0, w1)
        faces that touch the volume boundary and therefore get full weight."""
        t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
        h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
        w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
        border_width = (H + W) // 4 if border_width is None else border_width
        pad = torch.ones_like(h) * border_width
        # Minimum distance to any non-boundary face, ramped to (0, 1].
        mask = torch.stack([
            pad if is_bound[0] else t + 1,
            pad if is_bound[1] else T - t,
            pad if is_bound[2] else h + 1,
            pad if is_bound[3] else H - h,
            pad if is_bound[4] else w + 1,
            pad if is_bound[5] else W - w
        ]).min(dim=0).values
        mask = mask.clip(1, border_width)
        mask = (mask / border_width).to(dtype=dtype, device=device)
        mask = rearrange(mask, "T H W -> 1 1 T H W")
        return mask


    def tiled_forward(
        self,
        forward_fn,
        model_input,
        tile_size, tile_stride,
        tile_device="cpu", tile_dtype=torch.float32,
        computation_device="cuda", computation_dtype=torch.float32,
        border_width=None, scales=[1, 1, 1, 1],
        progress_bar=lambda x:x
    ):
        """Tile a (B, C, T, H, W) tensor over H/W only and blend forward_fn's
        outputs.

        ``scales`` gives the (C, T, H, W) size ratios of output vs input, so
        the accumulators can be allocated at the output resolution.
        """
        B, C, T, H, W = model_input.shape
        scale_C, scale_T, scale_H, scale_W = scales
        tile_size_H, tile_size_W = tile_size
        tile_stride_H, tile_stride_W = tile_stride

        # Output-resolution accumulators: weighted values and total weight.
        value = torch.zeros((B, int(C*scale_C), int(T*scale_T), int(H*scale_H), int(W*scale_W)), dtype=tile_dtype, device=tile_device)
        weight = torch.zeros((1, 1, int(T*scale_T), int(H*scale_H), int(W*scale_W)), dtype=tile_dtype, device=tile_device)

        # Split tasks
        tasks = []
        for h in range(0, H, tile_stride_H):
            for w in range(0, W, tile_stride_W):
                # Skip tiles fully covered by the previous (clamped) tile.
                if (h-tile_stride_H >= 0 and h-tile_stride_H+tile_size_H >= H) or (w-tile_stride_W >= 0 and w-tile_stride_W+tile_size_W >= W):
                    continue
                h_, w_ = h + tile_size_H, w + tile_size_W
                # Clamp the last tile in each direction to the border.
                if h_ > H: h, h_ = max(H - tile_size_H, 0), H
                if w_ > W: w, w_ = max(W - tile_size_W, 0), W
                tasks.append((h, h_, w, w_))

        # Run
        for hl, hr, wl, wr in progress_bar(tasks):
            # Time is never tiled, so both T faces count as boundaries.
            mask = self.build_mask(
                int(T*scale_T), int((hr-hl)*scale_H), int((wr-wl)*scale_W),
                tile_dtype, tile_device,
                is_bound=(True, True, hl==0, hr>=H, wl==0, wr>=W),
                border_width=border_width
            )
            grid_input = model_input[:, :, :, hl:hr, wl:wr].to(dtype=computation_dtype, device=computation_device)
            grid_output = forward_fn(grid_input).to(dtype=tile_dtype, device=tile_device)
            value[:, :, :, int(hl*scale_H):int(hr*scale_H), int(wl*scale_W):int(wr*scale_W)] += grid_output * mask
            weight[:, :, :, int(hl*scale_H):int(hr*scale_H), int(wl*scale_W):int(wr*scale_W)] += mask
        # Normalise overlapping contributions by their accumulated weight.
        value = value / weight
        return value
diffsynth/models/utils.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ from contextlib import contextmanager
4
+
5
+ import torch
6
+ from safetensors import safe_open
7
+
8
+
9
@contextmanager
def init_weights_on_device(device = torch.device("meta"), include_buffers :bool = False):
    """Context manager that makes newly registered parameters (and optionally
    buffers / tensor constructors) land on ``device`` — typically the meta
    device, so huge models can be instantiated without allocating weights.

    Temporarily monkey-patches ``torch.nn.Module.register_parameter`` (and
    ``register_buffer`` / ``torch.empty`` etc. when ``include_buffers``) and
    restores the originals on exit.
    """

    old_register_parameter = torch.nn.Module.register_parameter
    if include_buffers:
        old_register_buffer = torch.nn.Module.register_buffer

    def register_empty_parameter(module, name, param):
        # Register normally, then replace the stored parameter with a copy
        # moved to the target device, preserving requires_grad.
        old_register_parameter(module, name, param)
        if param is not None:
            param_cls = type(module._parameters[name])
            kwargs = module._parameters[name].__dict__
            kwargs["requires_grad"] = param.requires_grad
            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)

    def register_empty_buffer(module, name, buffer, persistent=True):
        old_register_buffer(module, name, buffer, persistent=persistent)
        if buffer is not None:
            module._buffers[name] = module._buffers[name].to(device)

    def patch_tensor_constructor(fn):
        # Force the device kwarg of torch constructors (empty/zeros/ones/full).
        def wrapper(*args, **kwargs):
            kwargs["device"] = device
            return fn(*args, **kwargs)

        return wrapper

    if include_buffers:
        tensor_constructors_to_patch = {
            torch_function_name: getattr(torch, torch_function_name)
            for torch_function_name in ["empty", "zeros", "ones", "full"]
        }
    else:
        tensor_constructors_to_patch = {}

    try:
        torch.nn.Module.register_parameter = register_empty_parameter
        if include_buffers:
            torch.nn.Module.register_buffer = register_empty_buffer
        for torch_function_name in tensor_constructors_to_patch.keys():
            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
        yield
    finally:
        # Always restore the patched globals, even if the body raised.
        torch.nn.Module.register_parameter = old_register_parameter
        if include_buffers:
            torch.nn.Module.register_buffer = old_register_buffer
        for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
            setattr(torch, torch_function_name, old_torch_function)
57
+
58
def load_state_dict_from_folder(file_path, torch_dtype=None):
    """Merge the state dicts of every checkpoint file directly inside ``file_path``."""
    checkpoint_extensions = ("safetensors", "bin", "ckpt", "pth", "pt")
    merged = {}
    for file_name in os.listdir(file_path):
        if "." not in file_name:
            continue
        if file_name.split(".")[-1] not in checkpoint_extensions:
            continue
        merged.update(load_state_dict(os.path.join(file_path, file_name), torch_dtype=torch_dtype))
    return merged
66
+
67
+
68
def load_state_dict(file_path, torch_dtype=None, device="cpu"):
    """Dispatch to the safetensors or torch loader based on the file extension."""
    loader = (
        load_state_dict_from_safetensors
        if file_path.endswith(".safetensors")
        else load_state_dict_from_bin
    )
    return loader(file_path, torch_dtype=torch_dtype, device=device)
73
+
74
+
75
def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
    """Read every tensor from a .safetensors file, optionally casting to ``torch_dtype``."""
    state_dict = {}
    with safe_open(file_path, framework="pt", device=device) as handle:
        for key in handle.keys():
            tensor = handle.get_tensor(key)
            state_dict[key] = tensor if torch_dtype is None else tensor.to(torch_dtype)
    return state_dict
83
+
84
+
85
def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
    """Load a torch-serialized checkpoint, optionally casting tensor values.

    Non-tensor entries are left untouched; ``weights_only=True`` keeps
    deserialization restricted to safe types.
    """
    state_dict = torch.load(file_path, map_location=device, weights_only=True)
    if torch_dtype is not None:
        for key, value in state_dict.items():
            if isinstance(value, torch.Tensor):
                state_dict[key] = value.to(torch_dtype)
    return state_dict
92
+
93
+
94
def search_for_embeddings(state_dict):
    """Collect every tensor in a (possibly nested) state dict, depth-first."""
    embeddings = []
    for value in state_dict.values():
        if isinstance(value, torch.Tensor):
            embeddings.append(value)
        elif isinstance(value, dict):
            embeddings.extend(search_for_embeddings(value))
    return embeddings
102
+
103
+
104
def search_parameter(param, state_dict):
    """Return the key of the first state_dict entry numerically matching ``param``.

    Entries with a different element count are skipped; same-shape entries are
    compared directly, otherwise the flattened values are compared. Returns
    ``None`` when nothing matches within the 1e-3 distance tolerance.
    """
    for name, candidate in state_dict.items():
        if param.numel() != candidate.numel():
            continue
        if param.shape == candidate.shape:
            if torch.dist(param, candidate) < 1e-3:
                return name
        elif torch.dist(param.flatten(), candidate.flatten()) < 1e-3:
            return name
    return None
114
+
115
+
116
def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
    """Print a source-key -> target-key mapping for value-identical parameters.

    With ``split_qkv`` a source tensor whose leading dimension divides by 3 is
    also tried as three stacked chunks (fused q/k/v weights). Unmatched target
    keys are reported at the end. Output is printed, not returned.
    """
    matched_keys = set()
    with torch.no_grad():
        for name, param in source_state_dict.items():
            rename = search_parameter(param, target_state_dict)
            if rename is not None:
                print(f'"{name}": "{rename}",')
                matched_keys.add(rename)
            elif split_qkv and len(param.shape) >= 1 and param.shape[0] % 3 == 0:
                length = param.shape[0] // 3
                rename = [
                    search_parameter(param[i * length: i * length + length], target_state_dict)
                    for i in range(3)
                ]
                if None not in rename:
                    print(f'"{name}": {rename},')
                    matched_keys.update(rename)
    for name in target_state_dict:
        if name not in matched_keys:
            print("Cannot find", name, target_state_dict[name].shape)
136
+
137
+
138
def search_for_files(folder, extensions):
    """Recursively collect file paths under ``folder`` ending with any of
    ``extensions``, visiting directory entries in sorted order."""
    found = []
    if os.path.isdir(folder):
        for entry in sorted(os.listdir(folder)):
            found.extend(search_for_files(os.path.join(folder, entry), extensions))
    elif os.path.isfile(folder) and folder.endswith(tuple(extensions)):
        found.append(folder)
    return found
149
+
150
+
151
def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
    """Flatten a (possibly nested) state dict's key structure into one sorted,
    comma-joined string; tensor keys optionally carry a ``key:d0_d1`` shape tag.
    Nested dicts are encoded as ``parent|<flattened children>``."""
    entries = []
    for key, value in state_dict.items():
        if not isinstance(key, str):
            continue
        if isinstance(value, torch.Tensor):
            if with_shape:
                shape_tag = "_".join(str(dim) for dim in value.shape)
                entries.append(key + ":" + shape_tag)
            entries.append(key)
        elif isinstance(value, dict):
            entries.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
    return ",".join(sorted(entries))
165
+
166
+
167
def split_state_dict_with_prefix(state_dict):
    """Partition a state dict into sub-dicts grouped by the first dotted
    component of each (string) key; groups appear in sorted-key order."""
    grouped = {}
    for key in sorted(k for k in state_dict if isinstance(k, str)):
        prefix = key.split(".")[0] if "." in key else key
        grouped.setdefault(prefix, []).append(key)
    return [{key: state_dict[key] for key in keys} for keys in grouped.values()]
180
+
181
+
182
def hash_state_dict_keys(state_dict, with_shape=True):
    """MD5 fingerprint of a state dict's key structure (used to detect model
    architectures without reading the weights)."""
    summary = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
    return hashlib.md5(summary.encode(encoding="UTF-8")).hexdigest()
diffsynth/models/wan_video_camera_controller.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from einops import rearrange
5
+ import os
6
+ from typing_extensions import Literal
7
+
8
+
9
class SimpleAdapter(nn.Module):
    """Encodes per-frame conditioning maps (e.g. camera plucker embeddings)
    into a feature volume: pixel-unshuffle x8, strided conv, residual blocks.
    Input/output are 5D (batch, channel, frame, height, width)."""

    def __init__(self, in_dim, out_dim, kernel_size, stride, num_residual_blocks=1):
        super(SimpleAdapter, self).__init__()

        # Pixel Unshuffle: reduce spatial dimensions by a factor of 8
        self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=8)

        # Convolution: reduce spatial dimensions by a factor
        # of 2 (without overlap)
        # in_dim * 64 because PixelUnshuffle(8) multiplies channels by 8*8.
        self.conv = nn.Conv2d(in_dim * 64, out_dim,
                              kernel_size=kernel_size, stride=stride, padding=0)

        # Residual blocks for feature extraction
        self.residual_blocks = nn.Sequential(
            *[ResidualBlock(out_dim) for _ in range(num_residual_blocks)]
        )

    def forward(self, x):
        """Encode a (b, c, f, h, w) tensor; frames are processed independently."""
        # Reshape to merge the frame dimension into batch
        bs, c, f, h, w = x.size()
        x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)

        # Pixel Unshuffle operation
        x_unshuffled = self.pixel_unshuffle(x)

        # Convolution operation
        x_conv = self.conv(x_unshuffled)

        # Feature extraction with residual blocks
        out = self.residual_blocks(x_conv)

        # Reshape to restore original bf dimension
        out = out.view(bs, f, out.size(1), out.size(2), out.size(3))

        # Permute dimensions to reorder (if needed), e.g., swap channels and feature frames
        out = out.permute(0, 2, 1, 3, 4)

        return out

    def process_camera_coordinates(
        self,
        direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"],
        length: int,
        height: int,
        width: int,
        speed: float = 1/54,
        origin=(0, 0.532139961, 0.946026558, 0.5, 0.5,
                0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
    ):
        """Generate a panning camera trajectory and convert it to a plucker
        embedding via the module-level helpers; does not use ``self``."""
        if origin is None:
            # Re-apply the default when callers explicitly pass None.
            origin = (0, 0.532139961, 0.946026558, 0.5, 0.5,
                      0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
        print(
            f"Generating camera coordinates with direction: {direction}, length: {length}, speed: {speed}, origin: {origin}")
        coordinates = generate_camera_coordinates(
            direction, length, speed, origin)
        print(f"Generated {len(coordinates)} camera coordinates.")
        plucker_embedding = process_pose_file(coordinates, width, height)
        print(
            f"Processed camera coordinates into plucker embedding with shape: {plucker_embedding.shape}")
        return plucker_embedding
70
+
71
+
72
class ResidualBlock(nn.Module):
    """A 3x3-conv residual unit: out = x + conv2(relu(conv1(x)))."""

    def __init__(self, dim):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)

    def forward(self, x):
        """Apply conv-relu-conv and add the input back (shape preserved)."""
        skip = x
        hidden = self.conv2(self.relu(self.conv1(x)))
        hidden += skip
        return hidden
85
+
86
+
87
class Camera(object):
    """Pinhole camera parsed from one pose-file row.

    Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py

    ``entry`` layout: [_, fx, fy, cx, cy, _, _, <12 row-major values of the
    3x4 world-to-camera matrix>].
    """

    def __init__(self, entry):
        self.fx, self.fy, self.cx, self.cy = entry[1:5]
        # Promote the 3x4 world-to-camera matrix to homogeneous 4x4 form.
        pose = np.eye(4)
        pose[:3, :] = np.array(entry[7:]).reshape(3, 4)
        self.w2c_mat = pose
        self.c2w_mat = np.linalg.inv(pose)
102
+
103
+
104
def get_relative_pose(cam_params):
    """Re-express camera-to-world poses relative to the first camera.

    Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py

    The first pose becomes the (identity-like) target frame; every later pose
    is left-multiplied by ``target @ w2c_0``. Returns an (N, 4, 4) float32 array.
    """
    source_w2c = cam_params[0].w2c_mat
    cam_to_origin = 0
    target_cam_c2w = np.array([
        [1, 0, 0, 0],
        [0, 1, 0, -cam_to_origin],
        [0, 0, 1, 0],
        [0, 0, 0, 1]
    ])
    abs2rel = target_cam_c2w @ source_w2c
    relative_poses = [target_cam_c2w]
    relative_poses.extend(abs2rel @ cam.c2w_mat for cam in cam_params[1:])
    return np.array(relative_poses, dtype=np.float32)
121
+
122
+
123
def custom_meshgrid(*args):
    """Wrapper around torch.meshgrid pinned to 'ij' indexing (torch>=2.0.0 only)."""
    grids = torch.meshgrid(*args, indexing='ij')
    return grids
126
+
127
+
128
def ray_condition(K, c2w, H, W, device):
    """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py

    Build per-pixel Plücker ray embeddings from intrinsics and poses.
    Returns a (B, V, H, W, 6) tensor of [ray_origin x ray_dir, ray_dir].
    """
    # c2w: B, V, 4, 4
    # K: B, V, 4

    B = K.shape[0]

    # Pixel-centre coordinates (+0.5) for every pixel of an HxW grid.
    j, i = custom_meshgrid(
        torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
        torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
    )
    i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]
    j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5  # [B, HxW]

    fx, fy, cx, cy = K.chunk(4, dim=-1)  # B,V, 1

    # Unproject pixels to unit-depth camera-space directions, then normalise.
    zs = torch.ones_like(i)  # [B, HxW]
    xs = (i - cx) / fx * zs
    ys = (j - cy) / fy * zs
    zs = zs.expand_as(ys)

    directions = torch.stack((xs, ys, zs), dim=-1)  # B, V, HW, 3
    directions = directions / \
        directions.norm(dim=-1, keepdim=True)  # B, V, HW, 3

    # Rotate directions into world space; origins are the camera centres.
    rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2)  # B, V, 3, HW
    rays_o = c2w[..., :3, 3]  # B, V, 3
    rays_o = rays_o[:, :, None].expand_as(rays_d)  # B, V, 3, HW
    # c2w @ dirctions
    # Plücker coordinates: (origin x direction, direction).
    rays_dxo = torch.linalg.cross(rays_o, rays_d)
    plucker = torch.cat([rays_dxo, rays_d], dim=-1)
    plucker = plucker.reshape(B, c2w.shape[1], H, W, 6)  # B, V, H, W, 6
    # plucker = plucker.permute(0, 1, 4, 2, 3)
    return plucker
163
+
164
+
165
def process_pose_file(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu', return_poses=False):
    """Convert raw pose-file rows into per-frame Plücker embeddings.

    Returns the raw rows unchanged when ``return_poses`` is True; otherwise a
    (V, H, W, 6) tensor built via Camera / get_relative_pose / ray_condition.
    """
    if return_poses:
        return cam_params
    else:
        cam_params = [Camera(cam_param) for cam_param in cam_params]

        sample_wh_ratio = width / height
        # Assuming placeholder ratios, change as needed
        pose_wh_ratio = original_pose_width / original_pose_height

        # Rescale the normalised focal lengths so the original pose aspect
        # ratio maps onto the requested sample resolution.
        if pose_wh_ratio > sample_wh_ratio:
            resized_ori_w = height * pose_wh_ratio
            for cam_param in cam_params:
                cam_param.fx = resized_ori_w * cam_param.fx / width
        else:
            resized_ori_h = width / pose_wh_ratio
            for cam_param in cam_params:
                cam_param.fy = resized_ori_h * cam_param.fy / height

        # Denormalise intrinsics to pixel units for the target resolution.
        intrinsic = np.asarray([[cam_param.fx * width,
                                 cam_param.fy * height,
                                 cam_param.cx * width,
                                 cam_param.cy * height]
                                for cam_param in cam_params], dtype=np.float32)

        K = torch.as_tensor(intrinsic)[None]  # [1, 1, 4]
        # Assuming this function is defined elsewhere
        c2ws = get_relative_pose(cam_params)
        c2ws = torch.as_tensor(c2ws)[None]  # [1, n_frame, 4, 4]
        plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[
            0].permute(0, 3, 1, 2).contiguous()  # V, 6, H, W
        plucker_embedding = plucker_embedding[None]
        plucker_embedding = rearrange(
            plucker_embedding, "b f c h w -> b f h w c")[0]
        return plucker_embedding
200
+
201
+
202
def generate_camera_coordinates(
    direction: Literal["Left", "Right", "Up", "Down", "LeftUp", "LeftDown", "RightUp", "RightDown"],
    length: int,
    speed: float = 1/54,
    origin=(0, 0.532139961, 0.946026558, 0.5, 0.5,
            0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0)
):
    """Build ``length`` pose rows panning from ``origin`` in ``direction``.

    Each step shifts the translation entries of the 3x4 pose: index 9
    (horizontal) and/or index 13 (vertical) by ``speed`` per frame.
    Returns a list of lists.
    """
    coordinates = [list(origin)]
    for _ in range(length - 1):
        coor = list(coordinates[-1])
        if "Left" in direction:
            coor[9] += speed
        if "Right" in direction:
            coor[9] -= speed
        if "Up" in direction:
            coor[13] += speed
        if "Down" in direction:
            coor[13] -= speed
        coordinates.append(coor)
    return coordinates
diffsynth/models/wan_video_dit.py ADDED
@@ -0,0 +1,974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from diffusers.models.lora import LoRALinearLayer
8
+ from einops import rearrange
9
+
10
+ from .utils import hash_state_dict_keys
11
+ from .wan_video_camera_controller import SimpleAdapter
12
+
13
# Optional attention backends, probed once at import time. flash_attention()
# prefers them in this order: FlashAttention-3 > FlashAttention-2 >
# SageAttention > torch.nn.functional.scaled_dot_product_attention.
try:
    import flash_attn_interface

    FLASH_ATTN_3_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_3_AVAILABLE = False

try:
    import flash_attn

    FLASH_ATTN_2_AVAILABLE = True
except ModuleNotFoundError:
    FLASH_ATTN_2_AVAILABLE = False

try:
    from sageattention import sageattn

    SAGE_ATTN_AVAILABLE = True
    # Plain string literal (the original used an f-string with no placeholders).
    print("========= Using sage attention, please note that this is for inference speed up only!==========")
except ModuleNotFoundError:
    SAGE_ATTN_AVAILABLE = False
34
+
35
+
36
def flash_attention(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    num_heads: int,
    compatibility_mode=False,
):
    """Multi-head attention over ``(batch, seq, num_heads * head_dim)`` tensors.

    Dispatches to the fastest available backend (FlashAttention-3, then
    FlashAttention-2, then SageAttention, then PyTorch SDPA). With
    ``compatibility_mode=True`` the PyTorch SDPA path is forced. The returned
    tensor has the same ``(b, s, n*d)`` layout as the inputs.
    """

    def split_heads_first(t):
        # (b, s, n*d) -> (b, n, s, d): layout expected by SDPA / SageAttention.
        return rearrange(t, "b s (n d) -> b n s d", n=num_heads)

    def split_heads_last(t):
        # (b, s, n*d) -> (b, s, n, d): layout expected by flash-attn kernels.
        return rearrange(t, "b s (n d) -> b s n d", n=num_heads)

    if compatibility_mode:
        out = F.scaled_dot_product_attention(
            split_heads_first(q), split_heads_first(k), split_heads_first(v)
        )
        return rearrange(out, "b n s d -> b s (n d)", n=num_heads)

    if FLASH_ATTN_3_AVAILABLE:
        out = flash_attn_interface.flash_attn_func(
            split_heads_last(q), split_heads_last(k), split_heads_last(v)
        )
        # Some flash-attn-3 versions return (output, lse).
        if isinstance(out, tuple):
            out = out[0]
        return rearrange(out, "b s n d -> b s (n d)", n=num_heads)

    if FLASH_ATTN_2_AVAILABLE:
        out = flash_attn.flash_attn_func(
            split_heads_last(q), split_heads_last(k), split_heads_last(v)
        )
        return rearrange(out, "b s n d -> b s (n d)", n=num_heads)

    if SAGE_ATTN_AVAILABLE:
        out = sageattn(split_heads_first(q), split_heads_first(k), split_heads_first(v))
        return rearrange(out, "b n s d -> b s (n d)", n=num_heads)

    # Fallback: PyTorch's built-in scaled dot-product attention.
    out = F.scaled_dot_product_attention(
        split_heads_first(q), split_heads_first(k), split_heads_first(v)
    )
    return rearrange(out, "b n s d -> b s (n d)", n=num_heads)
76
+
77
+
78
def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
    """AdaLN-style affine modulation: scale ``x`` around 1.0, then shift."""
    scaled = x * (scale + 1)
    return scaled + shift
80
+
81
+
82
def sinusoidal_embedding_1d(dim, position):
    """Classic transformer sinusoidal embedding of 1D positions.

    Returns a ``(len(position), dim)`` tensor: the first ``dim // 2`` columns
    are cosines, the rest sines, of ``position / 10000**(i / (dim // 2))``.
    Angles are computed in float64 and the result is cast to
    ``position.dtype``.
    """
    half = dim // 2
    exponent = torch.arange(half, dtype=torch.float64, device=position.device) / half
    inv_freq = torch.pow(10000, -exponent)
    angles = torch.outer(position.type(torch.float64), inv_freq)
    emb = torch.cat([torch.cos(angles), torch.sin(angles)], dim=1)
    return emb.to(position.dtype)
94
+
95
+
96
def precompute_freqs_cis_3d(dim: int, end: int = 1024, theta: float = 10000.0):
    """Precompute 3D RoPE tables for the (frame, height, width) axes.

    The per-head channel budget ``dim`` is split three ways: height and width
    each receive ``dim // 3`` channels; the frame axis takes the remainder
    ``dim - 2 * (dim // 3)`` so all channels are used.
    """
    spatial_dim = dim // 3
    frame_dim = dim - 2 * spatial_dim
    return (
        precompute_freqs_cis(frame_dim, end, theta),
        precompute_freqs_cis(spatial_dim, end, theta),
        precompute_freqs_cis(spatial_dim, end, theta),
    )
102
+
103
+
104
def precompute_freqs_cis(dim: int, end: int = 1024, theta: float = 10000.0):
    """Precompute 1D RoPE rotations as a complex ``(end, dim // 2)`` table.

    Entry ``[p, i]`` equals ``exp(1j * p / theta**(2i / dim))`` (unit modulus).
    Computed in float64, so the result dtype is complex128.
    """
    even_channels = torch.arange(0, dim, 2)[: (dim // 2)].double()
    inv_freq = 1.0 / (theta ** (even_channels / dim))
    angles = torch.outer(torch.arange(end, device=inv_freq.device), inv_freq)
    # Unit-magnitude complex rotations for each (position, channel-pair).
    return torch.polar(torch.ones_like(angles), angles)
111
+
112
+
113
def rope_apply(x, freqs, num_heads):
    """Rotate q/k features with precomputed complex RoPE factors.

    ``x`` is ``(b, s, n*d)``; ``freqs`` is a complex tensor broadcastable
    against ``(b, s, n, d // 2)``. Adjacent channel pairs are interpreted as
    complex numbers, multiplied by ``freqs``, and flattened back; the result
    keeps ``x``'s shape and dtype.
    """
    heads = rearrange(x, "b s (n d) -> b s n d", n=num_heads)
    b, s, n = heads.shape[0], heads.shape[1], heads.shape[2]
    pairs = heads.to(torch.float64).reshape(b, s, n, -1, 2)
    rotated = torch.view_as_complex(pairs) * freqs
    return torch.view_as_real(rotated).flatten(2).to(x.dtype)
120
+
121
+
122
class RMSNorm(nn.Module):
    """Root-mean-square layer norm with a learned per-channel gain."""

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def norm(self, x):
        # Divide by the RMS of the last dimension (eps for stability).
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        # Normalize in float32, then restore the caller's dtype before scaling.
        original_dtype = x.dtype
        normalized = self.norm(x.float()).to(original_dtype)
        return normalized * self.weight
135
+
136
+
137
class AttentionModule(nn.Module):
    """nn.Module wrapper around the backend-dispatching ``flash_attention``."""

    def __init__(self, num_heads):
        super().__init__()
        self.num_heads = num_heads

    def forward(self, q, k, v):
        return flash_attention(q=q, k=k, v=v, num_heads=self.num_heads)
145
+
146
+
147
class SelfAttention(nn.Module):
    """Multi-head self-attention with RMS-normalized q/k and 3D RoPE."""

    def __init__(self, dim: int, num_heads: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.norm_q = RMSNorm(dim, eps=eps)
        self.norm_k = RMSNorm(dim, eps=eps)

        self.attn = AttentionModule(self.num_heads)

    def forward(self, x, freqs):
        # RoPE is applied to the normalized query/key projections only.
        query = rope_apply(self.norm_q(self.q(x)), freqs, self.num_heads)
        key = rope_apply(self.norm_k(self.k(x)), freqs, self.num_heads)
        value = self.v(x)
        return self.o(self.attn(query, key, value))
171
+
172
+
173
class CrossAttention(nn.Module):
    """Multi-head cross-attention against the text context.

    When ``has_image_input`` is True, the first 257 context tokens are treated
    as CLIP image embeddings and attended via a separate k/v projection; the
    image and text attention outputs are summed before the output projection.
    """

    def __init__(
        self, dim: int, num_heads: int, eps: float = 1e-6, has_image_input: bool = False
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.norm_q = RMSNorm(dim, eps=eps)
        self.norm_k = RMSNorm(dim, eps=eps)
        self.has_image_input = has_image_input
        if has_image_input:
            self.k_img = nn.Linear(dim, dim)
            self.v_img = nn.Linear(dim, dim)
            self.norm_k_img = RMSNorm(dim, eps=eps)

        self.attn = AttentionModule(self.num_heads)

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        if self.has_image_input:
            # Context layout: 257 CLIP image tokens followed by text tokens.
            img, ctx = y[:, :257], y[:, 257:]
        else:
            ctx = y
        query = self.norm_q(self.q(x))
        out = self.attn(query, self.norm_k(self.k(ctx)), self.v(ctx))
        if self.has_image_input:
            img_out = flash_attention(
                query,
                self.norm_k_img(self.k_img(img)),
                self.v_img(img),
                num_heads=self.num_heads,
            )
            out = out + img_out
        return self.o(out)
212
+
213
+
214
class SelfAttentionSeparate(nn.Module):
    """Self-attention with optional camera-pose conditioning.

    When a ``camera_pose_embedding`` is supplied, each of the q/k/v paths is
    corrected both before the base projection (input side) and after it
    (output side). With ``rank > 0`` the corrections are low-rank LoRA layers
    (zero-initialized by diffusers); with ``rank <= 0`` they are full
    ``nn.Linear`` layers that ``zero_init_linear`` zeroes, so a freshly added
    adapter is a no-op either way.

    BUGFIX: the original ``forward`` branch condition was inverted — it took
    the plain path when ``camera_pose_embedding`` was provided (silently
    ignoring it) and dereferenced ``None`` otherwise. The ``is None`` check
    now selects the plain path.
    """

    def __init__(self, dim: int, num_heads: int, eps: float = 1e-6, rank=64):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        if rank > 0:
            # Low-rank (LoRA) camera-conditioning corrections.
            self.q_zl_before = LoRALinearLayer(dim, dim, rank=rank)
            self.k_zl_before = LoRALinearLayer(dim, dim, rank=rank)
            self.v_zl_before = LoRALinearLayer(dim, dim, rank=rank)

            self.q_zl_after = LoRALinearLayer(dim, dim, rank=rank)
            self.k_zl_after = LoRALinearLayer(dim, dim, rank=rank)
            self.v_zl_after = LoRALinearLayer(dim, dim, rank=rank)
        else:
            # Full-rank fallback; zeroed below so it starts as a no-op.
            self.q_zl_before = nn.Linear(dim, dim)
            self.k_zl_before = nn.Linear(dim, dim)
            self.v_zl_before = nn.Linear(dim, dim)

            self.q_zl_after = nn.Linear(dim, dim)
            self.k_zl_after = nn.Linear(dim, dim)
            self.v_zl_after = nn.Linear(dim, dim)

        self.norm_q = RMSNorm(dim, eps=eps)
        self.norm_k = RMSNorm(dim, eps=eps)

        self.attn = AttentionModule(self.num_heads)
        self.zero_init_linear()

    def zero_init_linear(self):
        """Zero the full-rank correction layers (LoRA layers are skipped —
        diffusers already zero-initializes their up-projection)."""
        layers_to_handle = [
            self.q_zl_before,
            self.k_zl_before,
            self.v_zl_before,
            self.q_zl_after,
            self.k_zl_after,
            self.v_zl_after,
        ]
        for _layer in layers_to_handle:
            if isinstance(_layer, nn.Linear):
                nn.init.zeros_(_layer.weight)
                if _layer.bias is not None:
                    nn.init.zeros_(_layer.bias)

    def forward(self, x, freqs, camera_pose_embedding=None):
        if camera_pose_embedding is None:
            # No camera conditioning: behave like plain SelfAttention.
            q = self.norm_q(self.q(x))
            k = self.norm_k(self.k(x))
            v = self.v(x)
        else:
            # Inject camera pose before and after each base projection.
            q = self.norm_q(
                self.q(x + self.q_zl_before(camera_pose_embedding))
                + self.q_zl_after(camera_pose_embedding)
            )
            k = self.norm_k(
                self.k(x + self.k_zl_before(camera_pose_embedding))
                + self.k_zl_after(camera_pose_embedding)
            )
            v = self.v(x + self.v_zl_before(camera_pose_embedding)) + self.v_zl_after(
                camera_pose_embedding
            )

        q = rope_apply(q, freqs, self.num_heads)
        k = rope_apply(k, freqs, self.num_heads)
        x = self.attn(q, k, v)
        return self.o(x)
291
+
292
+
293
class CrossAttentionSeparate(nn.Module):
    """Cross-attention with optional camera-pose conditioning on the query.

    Only the query path is corrected (before and after the base projection);
    keys/values come from the text/image context unchanged. Corrections are
    LoRA layers when ``rank > 0``, otherwise bias-free ``nn.Linear`` layers
    zeroed by ``zero_init_linear`` so the adapter starts as a no-op.

    BUGFIX: the original ``forward`` dereferenced ``camera_pose_embedding``
    unconditionally and crashed on the default ``None``; the plain query path
    (present in the original as commented-out code) is now restored for that
    case.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        eps: float = 1e-6,
        has_image_input: bool = False,
        rank=64,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)

        if rank > 0:
            # Low-rank (LoRA) query corrections.
            self.q_zl_before = LoRALinearLayer(dim, dim, rank=rank)
            self.q_zl_after = LoRALinearLayer(dim, dim, rank=rank)
        else:
            # Full-rank fallback; zeroed below so it starts as a no-op.
            self.q_zl_before = nn.Linear(dim, dim, bias=False)
            self.q_zl_after = nn.Linear(dim, dim, bias=False)

        self.norm_q = RMSNorm(dim, eps=eps)
        self.norm_k = RMSNorm(dim, eps=eps)
        self.has_image_input = has_image_input
        if has_image_input:
            self.k_img = nn.Linear(dim, dim)
            self.v_img = nn.Linear(dim, dim)
            self.norm_k_img = RMSNorm(dim, eps=eps)

        self.attn = AttentionModule(self.num_heads)
        self.zero_init_linear()

    def zero_init_linear(self):
        """Zero the full-rank correction layers (LoRA layers are skipped —
        diffusers already zero-initializes their up-projection)."""
        layers_to_handle = [
            self.q_zl_before,
            self.q_zl_after,
        ]
        for _layer in layers_to_handle:
            if isinstance(_layer, nn.Linear):
                nn.init.zeros_(_layer.weight)
                if _layer.bias is not None:
                    nn.init.zeros_(_layer.bias)

    def forward(self, x: torch.Tensor, y: torch.Tensor, camera_pose_embedding=None):
        if self.has_image_input:
            # Context layout: 257 CLIP image tokens followed by text tokens.
            img = y[:, :257]
            ctx = y[:, 257:]
        else:
            ctx = y

        if camera_pose_embedding is None:
            # No camera conditioning: behave like plain CrossAttention.
            q = self.norm_q(self.q(x))
        else:
            # Inject camera pose before and after the base query projection.
            q = self.norm_q(
                self.q(x + self.q_zl_before(camera_pose_embedding))
                + self.q_zl_after(camera_pose_embedding)
            )
        k = self.norm_k(self.k(ctx))
        v = self.v(ctx)

        x = self.attn(q, k, v)
        if self.has_image_input:
            k_img = self.norm_k_img(self.k_img(img))
            v_img = self.v_img(img)
            y = flash_attention(q, k_img, v_img, num_heads=self.num_heads)
            x = x + y
        return self.o(x)
369
+
370
+
371
class GateModule(nn.Module):
    """Gated residual connection: returns ``x + gate * residual``."""

    def __init__(self):
        super().__init__()

    def forward(self, x, gate, residual):
        update = gate * residual
        return x + update
379
+
380
+
381
class DiTBlock(nn.Module):
    """Standard Wan DiT block.

    Pipeline: AdaLN-modulated self-attention (gated residual), then
    text/image cross-attention (plain residual), then an AdaLN-modulated
    feed-forward network (gated residual). ``t_mod`` carries the six
    per-timestep (shift, scale, gate) modulation terms.
    """

    def __init__(
        self,
        has_image_input: bool,
        dim: int,
        num_heads: int,
        ffn_dim: int,
        eps: float = 1e-6,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim

        self.self_attn = SelfAttention(dim, num_heads, eps)
        self.cross_attn = CrossAttention(
            dim, num_heads, eps, has_image_input=has_image_input
        )
        self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
        self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
        self.norm3 = nn.LayerNorm(dim, eps=eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim),
            nn.GELU(approximate="tanh"),
            nn.Linear(ffn_dim, dim),
        )
        # Learned per-block offsets for the six AdaLN modulation terms.
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
        self.gate = GateModule()

    def forward(self, x, context, t_mod, freqs, camera_pose_embedding=None):
        # camera_pose_embedding is accepted for signature parity with
        # CameraDiTBlock but is unused here.
        mod = self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(
            6, dim=1
        )

        attn_in = modulate(self.norm1(x), shift_msa, scale_msa)
        x = self.gate(x, gate_msa, self.self_attn(attn_in, freqs))
        x = x + self.cross_attn(self.norm3(x), context)
        ffn_in = modulate(self.norm2(x), shift_mlp, scale_mlp)
        return self.gate(x, gate_mlp, self.ffn(ffn_in))
422
+
423
+
424
class CameraDiTBlock(nn.Module):
    """DiT block variant whose attention layers accept a camera-pose
    embedding (via SelfAttentionSeparate / CrossAttentionSeparate).

    Structure mirrors DiTBlock: AdaLN-modulated self-attention, text/image
    cross-attention, AdaLN-modulated FFN, with gated residuals.
    """

    def __init__(
        self,
        has_image_input: bool,
        dim: int,
        num_heads: int,
        ffn_dim: int,
        eps: float = 1e-6,
        camera_lora_rank=64,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim

        self.self_attn = SelfAttentionSeparate(
            dim, num_heads, eps, rank=camera_lora_rank
        )
        self.cross_attn = CrossAttentionSeparate(
            dim, num_heads, eps, has_image_input=has_image_input, rank=camera_lora_rank
        )
        self.norm1 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
        self.norm2 = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
        self.norm3 = nn.LayerNorm(dim, eps=eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_dim),
            nn.GELU(approximate="tanh"),
            nn.Linear(ffn_dim, dim),
        )
        # Learned per-block offsets for the six AdaLN modulation terms.
        self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
        self.gate = GateModule()

    def forward(self, x, context, t_mod, freqs, camera_pose_embedding=None):
        mod = self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mod.chunk(
            6, dim=1
        )

        attn_in = modulate(self.norm1(x), shift_msa, scale_msa)
        x = self.gate(
            x, gate_msa, self.self_attn(attn_in, freqs, camera_pose_embedding)
        )
        x = x + self.cross_attn(self.norm3(x), context, camera_pose_embedding)
        ffn_in = modulate(self.norm2(x), shift_mlp, scale_mlp)
        return self.gate(x, gate_mlp, self.ffn(ffn_in))
471
+
472
+
473
class MLP(torch.nn.Module):
    """LayerNorm-bounded two-layer projection (used for CLIP image features).

    When ``has_pos_emb`` is True, a learned ``(1, 514, 1280)`` positional
    table is added to the input before projection.
    """

    def __init__(self, in_dim, out_dim, has_pos_emb=False):
        super().__init__()
        layers = [
            nn.LayerNorm(in_dim),
            nn.Linear(in_dim, in_dim),
            nn.GELU(),
            nn.Linear(in_dim, out_dim),
            nn.LayerNorm(out_dim),
        ]
        self.proj = torch.nn.Sequential(*layers)
        self.has_pos_emb = has_pos_emb
        if has_pos_emb:
            self.emb_pos = torch.nn.Parameter(torch.zeros((1, 514, 1280)))

    def forward(self, x):
        if not self.has_pos_emb:
            return self.proj(x)
        pos = self.emb_pos.to(dtype=x.dtype, device=x.device)
        return self.proj(x + pos)
491
+
492
+
493
class Head(nn.Module):
    """Final AdaLN-modulated projection from token features to patch values.

    Output feature size is ``out_dim * prod(patch_size)`` so each token can be
    unpatchified back into its ``patch_size`` voxel block.
    """

    def __init__(
        self, dim: int, out_dim: int, patch_size: Tuple[int, int, int], eps: float
    ):
        super().__init__()
        self.dim = dim
        self.patch_size = patch_size
        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
        self.head = nn.Linear(dim, out_dim * math.prod(patch_size))
        # Learned offsets for the two AdaLN terms (shift, scale).
        self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)

    def forward(self, x, t_mod):
        mod = self.modulation.to(dtype=t_mod.dtype, device=t_mod.device) + t_mod
        shift, scale = mod.chunk(2, dim=1)
        return self.head(self.norm(x) * (scale + 1) + shift)
510
+
511
+
512
class WanModel(torch.nn.Module):
    """Wan video diffusion transformer (DiT).

    Video latents are patchified by a 3D convolution into a token sequence,
    processed by ``num_layers`` DiT blocks conditioned on text embeddings, a
    sinusoidal timestep embedding, and 3D RoPE, then projected back to the
    latent video shape by ``Head`` + ``unpatchify``. Optional extras: CLIP
    image conditioning (``has_image_input``), a reference-image conv
    (``has_ref_conv``), and a camera control adapter (``add_control_adapter``).
    """

    def __init__(
        self,
        dim: int,
        in_dim: int,
        ffn_dim: int,
        out_dim: int,
        text_dim: int,
        freq_dim: int,
        eps: float,
        patch_size: Tuple[int, int, int],
        num_heads: int,
        num_layers: int,
        has_image_input: bool,
        has_image_pos_emb: bool = False,
        has_ref_conv: bool = False,
        add_control_adapter: bool = False,
        in_dim_control_adapter: int = 24,
    ):
        super().__init__()
        self.dim = dim
        self.freq_dim = freq_dim
        self.has_image_input = has_image_input
        self.patch_size = patch_size

        # Non-overlapping 3D patchify: (b, in_dim, F, H, W) -> (b, dim, f, h, w).
        self.patch_embedding = nn.Conv3d(
            in_dim, dim, kernel_size=patch_size, stride=patch_size
        )
        self.text_embedding = nn.Sequential(
            nn.Linear(text_dim, dim), nn.GELU(
                approximate="tanh"), nn.Linear(dim, dim)
        )
        self.time_embedding = nn.Sequential(
            nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
        )
        # Expands the timestep embedding into the six AdaLN terms per block.
        self.time_projection = nn.Sequential(
            nn.SiLU(), nn.Linear(dim, dim * 6))
        self.blocks = nn.ModuleList(
            [
                DiTBlock(has_image_input, dim, num_heads, ffn_dim, eps)
                for _ in range(num_layers)
            ]
        )
        self.head = Head(dim, out_dim, patch_size, eps)
        head_dim = dim // num_heads
        # Precomputed complex RoPE tables for the (f, h, w) axes.
        self.freqs = precompute_freqs_cis_3d(head_dim)

        if has_image_input:
            self.img_emb = MLP(
                1280, dim, has_pos_emb=has_image_pos_emb
            )  # clip_feature_dim = 1280
        if has_ref_conv:
            self.ref_conv = nn.Conv2d(
                16, dim, kernel_size=(2, 2), stride=(2, 2))
        self.has_image_pos_emb = has_image_pos_emb
        self.has_ref_conv = has_ref_conv

        self.control_adapter = None
        self.add_control_adapter = add_control_adapter
        if add_control_adapter:
            self.control_adapter = SimpleAdapter(
                in_dim_control_adapter,
                dim,
                kernel_size=patch_size[1:],
                stride=patch_size[1:],
            )
        else:
            # Redundant with the assignment above; kept for fidelity.
            self.control_adapter = None

    def patchify(
        self, x: torch.Tensor, control_camera_latents_input: Optional[torch.Tensor] = None
    ):
        """Patchify latents into tokens; returns (tokens, (f, h, w)) grid size."""
        x = self.patch_embedding(x)
        if (
            self.control_adapter is not None
            and control_camera_latents_input is not None
        ):
            y_camera = self.control_adapter(control_camera_latents_input)
            # NOTE(review): this comprehension turns x into a *list* of
            # per-sample tensors, so the .shape access below would fail unless
            # the commented line is restored — confirm the control-camera path.
            x = [u + v for u, v in zip(x, y_camera)]
            # x = x[0].unsqueeze(0)
        grid_size = x.shape[2:]
        x = rearrange(x, "b c f h w -> b (f h w) c").contiguous()
        return x, grid_size  # x, grid_size: (f, h, w)

    def unpatchify(self, x: torch.Tensor, grid_size: Tuple[int, int, int]):
        """Inverse of patchify: fold per-token patch values back into a
        (b, c, F, H, W) latent video."""
        return rearrange(
            x,
            "b (f h w) (x y z c) -> b c (f x) (h y) (w z)",
            f=grid_size[0],
            h=grid_size[1],
            w=grid_size[2],
            x=self.patch_size[0],
            y=self.patch_size[1],
            z=self.patch_size[2],
        )

    def forward(
        self,
        x: torch.Tensor,
        timestep: torch.Tensor,
        context: torch.Tensor,
        clip_feature: Optional[torch.Tensor] = None,
        y: Optional[torch.Tensor] = None,
        use_gradient_checkpointing: bool = False,
        use_gradient_checkpointing_offload: bool = False,
        **kwargs,
    ):
        """Denoising forward pass.

        x: latent video (b, c, F, H, W); y: extra conditioning latents
        concatenated on the channel axis when image input is enabled;
        clip_feature: CLIP image tokens prepended to the text context.
        Extra **kwargs are accepted but ignored.
        """
        t = self.time_embedding(
            sinusoidal_embedding_1d(self.freq_dim, timestep))
        t_mod = self.time_projection(t).unflatten(1, (6, self.dim))
        context = self.text_embedding(context)

        if self.has_image_input:
            # print(f"x,y shape: {x.shape}, {y.shape if y is not None else 'None'}")
            x = torch.cat([x, y], dim=1)  # (b, c_x + c_y, f, h, w)
            clip_embdding = self.img_emb(clip_feature)
            context = torch.cat([clip_embdding, context], dim=1)

        x, (f, h, w) = self.patchify(x)

        # Build the flattened 3D RoPE table for this token grid: each token at
        # (f, h, w) concatenates the per-axis rotation factors.
        freqs = (
            torch.cat(
                [
                    self.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
                    self.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
                    self.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
                ],
                dim=-1,
            )
            .reshape(f * h * w, 1, -1)
            .to(x.device)
        )

        def create_custom_forward(module):
            # Wrapper so torch.utils.checkpoint can re-invoke the block.
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        for block in self.blocks:
            if self.training and use_gradient_checkpointing:
                if use_gradient_checkpointing_offload:
                    # Offload checkpointed activations to CPU to save VRAM.
                    with torch.autograd.graph.save_on_cpu():
                        x = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(block),
                            x,
                            context,
                            t_mod,
                            freqs,
                            use_reentrant=False,
                        )
                else:
                    x = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        x,
                        context,
                        t_mod,
                        freqs,
                        use_reentrant=False,
                    )
            else:
                x = block(x, context, t_mod, freqs)

        x = self.head(x, t)
        x = self.unpatchify(x, (f, h, w))

        # rgb_head is not created in __init__; presumably attached externally
        # by the pipeline — verify against callers.
        if hasattr(self, 'rgb_head'):
            # NOTE(review): at this point x has already been unpatchified to
            # (b, c, F, H, W), but Head/unpatchify expect a token sequence —
            # this likely should use the pre-unpatchify tokens; confirm.
            rgb = self.rgb_head(x, t)
            rgb = self.unpatchify(rgb, (f, h, w))

            return x, rgb

        return x

    @staticmethod
    def state_dict_converter():
        """Return the converter used by the model manager to load external
        checkpoints into this module's naming scheme."""
        return WanModelStateDictConverter()
689
+
690
+
691
class WanModelStateDictConverter:
    """Renames external WanModel checkpoints into this repo's layout and
    recognizes known checkpoints (by a hash of their key set) to recover the
    matching architecture config.

    Fixes vs. the original: ``hash_state_dict_keys`` is computed once per call
    instead of once per branch, and an unreachable duplicate ``elif`` for hash
    ``6bfcfb3b342cb286ce886889d519a77e`` (identical config) was removed.
    """

    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        """Map Diffusers-layout weights to this layout.

        The block-0 entries in ``rename_dict`` act as templates: the same
        mapping is re-applied to every ``blocks.<i>.*`` key. Returns
        ``(renamed_state_dict, config)``; ``config`` is empty for
        unrecognized checkpoints.
        """
        rename_dict = {
            "blocks.0.attn1.norm_k.weight": "blocks.0.self_attn.norm_k.weight",
            "blocks.0.attn1.norm_q.weight": "blocks.0.self_attn.norm_q.weight",
            "blocks.0.attn1.to_k.bias": "blocks.0.self_attn.k.bias",
            "blocks.0.attn1.to_k.weight": "blocks.0.self_attn.k.weight",
            "blocks.0.attn1.to_out.0.bias": "blocks.0.self_attn.o.bias",
            "blocks.0.attn1.to_out.0.weight": "blocks.0.self_attn.o.weight",
            "blocks.0.attn1.to_q.bias": "blocks.0.self_attn.q.bias",
            "blocks.0.attn1.to_q.weight": "blocks.0.self_attn.q.weight",
            "blocks.0.attn1.to_v.bias": "blocks.0.self_attn.v.bias",
            "blocks.0.attn1.to_v.weight": "blocks.0.self_attn.v.weight",
            "blocks.0.attn2.norm_k.weight": "blocks.0.cross_attn.norm_k.weight",
            "blocks.0.attn2.norm_q.weight": "blocks.0.cross_attn.norm_q.weight",
            "blocks.0.attn2.to_k.bias": "blocks.0.cross_attn.k.bias",
            "blocks.0.attn2.to_k.weight": "blocks.0.cross_attn.k.weight",
            "blocks.0.attn2.to_out.0.bias": "blocks.0.cross_attn.o.bias",
            "blocks.0.attn2.to_out.0.weight": "blocks.0.cross_attn.o.weight",
            "blocks.0.attn2.to_q.bias": "blocks.0.cross_attn.q.bias",
            "blocks.0.attn2.to_q.weight": "blocks.0.cross_attn.q.weight",
            "blocks.0.attn2.to_v.bias": "blocks.0.cross_attn.v.bias",
            "blocks.0.attn2.to_v.weight": "blocks.0.cross_attn.v.weight",
            "blocks.0.ffn.net.0.proj.bias": "blocks.0.ffn.0.bias",
            "blocks.0.ffn.net.0.proj.weight": "blocks.0.ffn.0.weight",
            "blocks.0.ffn.net.2.bias": "blocks.0.ffn.2.bias",
            "blocks.0.ffn.net.2.weight": "blocks.0.ffn.2.weight",
            "blocks.0.norm2.bias": "blocks.0.norm3.bias",
            "blocks.0.norm2.weight": "blocks.0.norm3.weight",
            "blocks.0.scale_shift_table": "blocks.0.modulation",
            "condition_embedder.text_embedder.linear_1.bias": "text_embedding.0.bias",
            "condition_embedder.text_embedder.linear_1.weight": "text_embedding.0.weight",
            "condition_embedder.text_embedder.linear_2.bias": "text_embedding.2.bias",
            "condition_embedder.text_embedder.linear_2.weight": "text_embedding.2.weight",
            "condition_embedder.time_embedder.linear_1.bias": "time_embedding.0.bias",
            "condition_embedder.time_embedder.linear_1.weight": "time_embedding.0.weight",
            "condition_embedder.time_embedder.linear_2.bias": "time_embedding.2.bias",
            "condition_embedder.time_embedder.linear_2.weight": "time_embedding.2.weight",
            "condition_embedder.time_proj.bias": "time_projection.1.bias",
            "condition_embedder.time_proj.weight": "time_projection.1.weight",
            "patch_embedding.bias": "patch_embedding.bias",
            "patch_embedding.weight": "patch_embedding.weight",
            "scale_shift_table": "head.modulation",
            "proj_out.bias": "head.head.bias",
            "proj_out.weight": "head.head.weight",
        }
        state_dict_ = {}
        # Hash the key set once; it identifies the checkpoint architecture.
        keys_hash = hash_state_dict_keys(state_dict)
        print(
            f"hash_state_dict_keys(state_dict): {keys_hash}")
        for name, param in state_dict.items():
            if name in rename_dict:
                state_dict_[rename_dict[name]] = param
            else:
                # Rewrite "blocks.<i>.<suffix>" through the block-0 template,
                # then restore the original block index.
                name_ = ".".join(name.split(
                    ".")[:1] + ["0"] + name.split(".")[2:])
                if name_ in rename_dict:
                    name_ = rename_dict[name_]
                    name_ = ".".join(
                        name_.split(".")[:1]
                        + [name.split(".")[1]]
                        + name_.split(".")[2:]
                    )
                    state_dict_[name_] = param
        if keys_hash == "cb104773c6c2cb6df4f9529ad5c60d0b":
            # Wan 14B t2v (Diffusers layout).
            config = {
                "model_type": "t2v",
                "patch_size": (1, 2, 2),
                "text_len": 512,
                "in_dim": 16,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "window_size": (-1, -1),
                "qk_norm": True,
                "cross_attn_norm": True,
                "eps": 1e-6,
            }
        else:
            config = {}
        return state_dict_, config

    def from_civitai(self, state_dict):
        """Recognize Civitai-style checkpoints (VACE keys stripped) and return
        ``(state_dict, config)``; ``config`` is empty when unrecognized."""
        state_dict = {
            name: param
            for name, param in state_dict.items()
            if not name.startswith("vace")
        }
        # Hash the key set once; it identifies the checkpoint architecture.
        keys_hash = hash_state_dict_keys(state_dict)
        print(
            f"hash_state_dict_keys(state_dict): {keys_hash} from civitai"
        )

        if keys_hash == "9269f8db9040a9d860eaca435be61814":
            # 1.3B t2v
            config = {
                "has_image_input": False,
                "patch_size": [1, 2, 2],
                "in_dim": 16,
                "dim": 1536,
                "ffn_dim": 8960,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 12,
                "num_layers": 30,
                "eps": 1e-6,
            }
        elif keys_hash == "aafcfd9672c3a2456dc46e1cb6e52c70":
            # 14B t2v
            config = {
                "has_image_input": False,
                "patch_size": [1, 2, 2],
                "in_dim": 16,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
            }
        elif keys_hash == "6bfcfb3b342cb286ce886889d519a77e":
            # 14B i2v (this hash appeared twice in the original elif chain
            # with an identical config; the unreachable duplicate was removed)
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 36,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
            }
        elif keys_hash == "6d6ccde6845b95ad9114ab993d917893":
            # 1.3B i2v
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 36,
                "dim": 1536,
                "ffn_dim": 8960,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 12,
                "num_layers": 30,
                "eps": 1e-6,
            }
        elif keys_hash == "349723183fc063b2bfc10bb2835cf677":
            # 1.3B PAI control
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 48,
                "dim": 1536,
                "ffn_dim": 8960,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 12,
                "num_layers": 30,
                "eps": 1e-6,
            }
        elif keys_hash == "efa44cddf936c70abd0ea28b6cbe946c":
            # 14B PAI control
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 48,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
            }
        elif keys_hash == "3ef3b1f8e1dab83d5b71fd7b617f859f":
            # 14B i2v with image positional embedding
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 36,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
                "has_image_pos_emb": True,
            }
        elif keys_hash == "70ddad9d3a133785da5ea371aae09504":
            # 1.3B PAI control v1.1
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 48,
                "dim": 1536,
                "ffn_dim": 8960,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 12,
                "num_layers": 30,
                "eps": 1e-6,
                "has_ref_conv": True,
            }
        elif keys_hash == "26bde73488a92e64cc20b0a7485b9e5b":
            # 14B PAI control v1.1
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 48,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
                "has_ref_conv": True,
            }
        elif keys_hash == "ac6a5aa74f4a0aab6f64eb9a72f19901":
            # 1.3B PAI control-camera v1.1
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 32,
                "dim": 1536,
                "ffn_dim": 8960,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 12,
                "num_layers": 30,
                "eps": 1e-6,
                "has_ref_conv": False,
                "add_control_adapter": True,
                "in_dim_control_adapter": 24,
            }
        elif keys_hash == "b61c605c2adbd23124d152ed28e049ae":
            # 14B PAI control-camera v1.1
            config = {
                "has_image_input": True,
                "patch_size": [1, 2, 2],
                "in_dim": 32,
                "dim": 5120,
                "ffn_dim": 13824,
                "freq_dim": 256,
                "text_dim": 4096,
                "out_dim": 16,
                "num_heads": 40,
                "num_layers": 40,
                "eps": 1e-6,
                "has_ref_conv": False,
                "add_control_adapter": True,
                "in_dim_control_adapter": 24,
            }
        else:
            config = {}
        return state_dict, config
diffsynth/models/wan_video_image_encoder.py ADDED
@@ -0,0 +1,902 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Concise re-implementation of
3
+ ``https://github.com/openai/CLIP'' and
4
+ ``https://github.com/mlfoundations/open_clip''.
5
+ """
6
+ import math
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torchvision.transforms as T
11
+ from .wan_video_dit import flash_attention
12
+
13
+
14
class SelfAttention(nn.Module):
    """Multi-head self-attention with separate q/k/v/output projections.

    Dropout is applied both to the attention weights (training only) and to
    the projected output.
    """

    def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.eps = eps

        # projections (creation order kept stable for reproducible init)
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.o = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t, batch, seq):
        # [B, L, C] -> [B, H, L, head_dim]
        return t.reshape(batch, seq, self.num_heads,
                         self.head_dim).permute(0, 2, 1, 3)

    def forward(self, x, mask):
        """
        x: [B, L, C].
        mask: attention mask accepted by scaled_dot_product_attention, or None.
        """
        batch, seq, channels = x.size()

        q = self._split_heads(self.q(x), batch, seq)
        k = self._split_heads(self.k(x), batch, seq)
        v = self._split_heads(self.v(x), batch, seq)

        # attention-weight dropout is active only in training mode
        drop_p = self.dropout.p if self.training else 0.0
        out = F.scaled_dot_product_attention(q, k, v, mask, drop_p)
        out = out.permute(0, 2, 1, 3).reshape(batch, seq, channels)

        out = self.o(out)
        return self.dropout(out)
51
+
52
+
53
class AttentionBlock(nn.Module):
    """Transformer encoder block: self-attention + 4x MLP, each residual.

    ``post_norm=True`` gives the BERT-style layout (normalize after each
    residual sum); otherwise the pre-norm layout is used.
    """

    def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.post_norm = post_norm
        self.eps = eps

        # sub-layers
        hidden = dim * 4
        self.attn = SelfAttention(dim, num_heads, dropout, eps)
        self.norm1 = nn.LayerNorm(dim, eps=eps)
        self.ffn = nn.Sequential(
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim),
            nn.Dropout(dropout))
        self.norm2 = nn.LayerNorm(dim, eps=eps)

    def forward(self, x, mask):
        if self.post_norm:
            # normalize after each residual connection
            x = self.norm1(x + self.attn(x, mask))
            return self.norm2(x + self.ffn(x))
        # pre-norm: normalize the input of each sub-layer
        x = x + self.attn(self.norm1(x), mask)
        return x + self.ffn(self.norm2(x))
78
+
79
+
80
class XLMRoberta(nn.Module):
    """
    XLMRobertaModel with no pooler and no LM head.

    Token, token-type and position embeddings feed a stack of
    ``AttentionBlock`` layers; padding positions (``pad_id``) are masked out
    of the attention.
    """

    def __init__(self,
                 vocab_size=250002,
                 max_seq_len=514,
                 type_size=1,
                 pad_id=1,
                 dim=1024,
                 num_heads=16,
                 num_layers=24,
                 post_norm=True,
                 dropout=0.1,
                 eps=1e-5):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.type_size = type_size
        self.pad_id = pad_id
        self.dim = dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.post_norm = post_norm
        self.eps = eps

        # embeddings
        self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
        self.type_embedding = nn.Embedding(type_size, dim)
        self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
        self.dropout = nn.Dropout(dropout)

        # blocks
        self.blocks = nn.ModuleList([
            AttentionBlock(dim, num_heads, post_norm, dropout, eps)
            for _ in range(num_layers)
        ])

        # norm layer (applied to embeddings when post_norm, else to the output)
        self.norm = nn.LayerNorm(dim, eps=eps)

    def forward(self, ids):
        """
        ids: [B, L] of torch.LongTensor.

        Returns [B, L, dim] hidden states.
        """
        b, s = ids.shape
        mask = ids.ne(self.pad_id).long()

        # embeddings; position ids are RoBERTa-style: pad_id offset plus a
        # cumulative count over non-padding tokens (padding stays at pad_id)
        x = self.token_embedding(ids) + \
            self.type_embedding(torch.zeros_like(ids)) + \
            self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
        if self.post_norm:
            x = self.norm(x)
        x = self.dropout(x)

        # blocks: additive attention mask, 0 for visible, dtype-min for padding
        mask = torch.where(
            mask.view(b, 1, 1, s).gt(0), 0.0,
            torch.finfo(x.dtype).min)
        for block in self.blocks:
            x = block(x, mask)

        # output
        if not self.post_norm:
            x = self.norm(x)
        return x
148
+
149
+
150
def xlm_roberta_large(pretrained=False,
                      return_tokenizer=False,
                      device='cpu',
                      **kwargs):
    """
    XLMRobertaLarge adapted from Huggingface.

    Args:
        pretrained: load weights via ``sora.DOWNLOAD_TO_CACHE`` (meta-device
            init + ``assign=True``); otherwise initialize on ``device``.
        return_tokenizer: also build the matching HuggingFace tokenizer and
            return ``(model, tokenizer)``.
        device: target device for loading / initialization.
        **kwargs: overrides for the XLMRoberta config below.
    """
    # params
    cfg = dict(
        vocab_size=250002,
        max_seq_len=514,
        type_size=1,
        pad_id=1,
        dim=1024,
        num_heads=16,
        num_layers=24,
        post_norm=True,
        dropout=0.1,
        eps=1e-5)
    cfg.update(**kwargs)

    # init model
    if pretrained:
        from sora import DOWNLOAD_TO_CACHE

        # init a meta model (no memory allocated until the load below)
        with torch.device('meta'):
            model = XLMRoberta(**cfg)

        # load checkpoint
        model.load_state_dict(
            torch.load(
                DOWNLOAD_TO_CACHE('models/xlm_roberta/xlm_roberta_large.pth'),
                map_location=device),
            assign=True)
    else:
        # init a model on device
        with torch.device(device):
            model = XLMRoberta(**cfg)

    # init tokenizer
    if return_tokenizer:
        from sora.data import HuggingfaceTokenizer
        # Fix: XLMRoberta exposes ``max_seq_len``; the original read
        # ``model.text_len``, which does not exist on this class and raised
        # AttributeError whenever return_tokenizer was requested.
        tokenizer = HuggingfaceTokenizer(
            name='xlm-roberta-large',
            seq_len=model.max_seq_len,
            clean='whitespace')
        return model, tokenizer
    else:
        return model
200
+
201
+
202
+
203
def pos_interpolate(pos, seq_len):
    """Resize a [1, L, C] position-embedding table to ``seq_len`` positions.

    Leading non-grid tokens (e.g. the cls slot) are passed through unchanged;
    the square grid part is bicubically interpolated to the new grid size.
    Returns ``pos`` itself when no resizing is needed.
    """
    if pos.size(1) == seq_len:
        return pos

    src_grid = int(math.sqrt(pos.size(1)))
    tar_grid = int(math.sqrt(seq_len))
    num_extra = pos.size(1) - src_grid * src_grid

    # [1, G*G, C] -> [1, C, G, G] for 2-D interpolation
    grid = pos[:, num_extra:].float()
    grid = grid.reshape(1, src_grid, src_grid, -1).permute(0, 3, 1, 2)
    grid = F.interpolate(
        grid,
        size=(tar_grid, tar_grid),
        mode='bicubic',
        align_corners=False)
    grid = grid.flatten(2).transpose(1, 2)

    return torch.cat([pos[:, :num_extra], grid], dim=1)
220
+
221
+
222
class QuickGELU(nn.Module):
    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x):
        return torch.sigmoid(1.702 * x) * x
226
+
227
+
228
class LayerNorm(nn.LayerNorm):
    """LayerNorm whose output is cast back to the input dtype."""

    def forward(self, x):
        normed = super().forward(x)
        return normed.type_as(x)
232
+
233
+
234
class SelfAttention(nn.Module):
    """Multi-head self-attention using a fused qkv projection; the attention
    math is delegated to ``flash_attention`` (compatibility mode).

    NOTE(review): ``causal`` and ``attn_dropout`` are stored but not passed to
    the attention call — confirm this is intentional.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 causal=False,
                 attn_dropout=0.0,
                 proj_dropout=0.0):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.causal = causal
        self.attn_dropout = attn_dropout
        self.proj_dropout = proj_dropout

        # fused qkv projection + output projection
        self.to_qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x):
        """
        x: [B, L, C] -> [B, L, C].
        """
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        out = flash_attention(
            q, k, v, num_heads=self.num_heads, compatibility_mode=True)
        out = self.proj(out)
        return F.dropout(out, self.proj_dropout, self.training)
269
+
270
+
271
class SwiGLU(nn.Module):
    """SwiGLU feed-forward: ``silu(fc1(x)) * fc2(x)`` projected back by fc3."""

    def __init__(self, dim, mid_dim):
        super().__init__()
        self.dim = dim
        self.mid_dim = mid_dim

        # layers
        self.fc1 = nn.Linear(dim, mid_dim)
        self.fc2 = nn.Linear(dim, mid_dim)
        self.fc3 = nn.Linear(mid_dim, dim)

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        return self.fc3(gate * self.fc2(x))
287
+
288
+
289
class AttentionBlock(nn.Module):
    """Transformer block (attention + MLP) with selectable residual layout
    (pre- or post-norm) and activation ('quick_gelu', 'gelu', 'swi_glu')."""

    def __init__(self,
                 dim,
                 mlp_ratio,
                 num_heads,
                 post_norm=False,
                 causal=False,
                 activation='quick_gelu',
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 norm_eps=1e-5):
        assert activation in ['quick_gelu', 'gelu', 'swi_glu']
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.post_norm = post_norm
        self.causal = causal
        self.norm_eps = norm_eps

        # sub-layers
        self.norm1 = LayerNorm(dim, eps=norm_eps)
        self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
                                  proj_dropout)
        self.norm2 = LayerNorm(dim, eps=norm_eps)
        hidden = int(dim * mlp_ratio)
        if activation == 'swi_glu':
            self.mlp = SwiGLU(dim, hidden)
        else:
            self.mlp = nn.Sequential(
                nn.Linear(dim, hidden),
                QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
                nn.Linear(hidden, dim), nn.Dropout(proj_dropout))

    def forward(self, x):
        if self.post_norm:
            # normalize each sub-layer's output before adding the residual
            x = x + self.norm1(self.attn(x))
            return x + self.norm2(self.mlp(x))
        # pre-norm: normalize each sub-layer's input
        x = x + self.attn(self.norm1(x))
        return x + self.mlp(self.norm2(x))
331
+
332
+
333
class AttentionPool(nn.Module):
    """Single-query attention pooling: a learned cls query attends over all
    tokens, then a residual MLP refines the pooled vector.

    Returns one ``dim``-sized vector per sample.
    """

    def __init__(self,
                 dim,
                 mlp_ratio,
                 num_heads,
                 activation='gelu',
                 proj_dropout=0.0,
                 norm_eps=1e-5):
        assert dim % num_heads == 0
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.proj_dropout = proj_dropout
        self.norm_eps = norm_eps

        # layers
        gain = 1.0 / math.sqrt(dim)
        self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        self.to_q = nn.Linear(dim, dim)
        self.to_kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)
        self.norm = LayerNorm(dim, eps=norm_eps)
        self.mlp = nn.Sequential(
            nn.Linear(dim, int(dim * mlp_ratio)),
            QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
            nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))

    def forward(self, x):
        """
        x: [B, L, C].

        Returns [B, C]: the pooled representation.
        """
        b, s, c, n, d = *x.size(), self.num_heads, self.head_dim

        # compute query, key, value; the single query comes from the learned
        # cls embedding and is broadcast across the batch
        q = self.to_q(self.cls_embedding).view(1, 1, n*d).expand(b, -1, -1)
        k, v = self.to_kv(x).chunk(2, dim=-1)

        # compute attention
        x = flash_attention(q, k, v, num_heads=self.num_heads, compatibility_mode=True)
        x = x.reshape(b, 1, c)

        # output projection
        x = self.proj(x)
        x = F.dropout(x, self.proj_dropout, self.training)

        # residual MLP on the pooled token, then drop the length-1 dimension
        x = x + self.mlp(self.norm(x))
        return x[:, 0]
384
+
385
+
386
class VisionTransformer(nn.Module):
    """CLIP-style ViT image encoder.

    Patchifies the image with a strided Conv2d, optionally prepends a cls
    token, adds learned position embeddings, and runs a stack of
    ``AttentionBlock`` layers. ``forward`` returns token features; the
    pooling ``head`` is built here but not applied in ``forward``.
    """

    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 dim=768,
                 mlp_ratio=4,
                 out_dim=512,
                 num_heads=12,
                 num_layers=12,
                 pool_type='token',
                 pre_norm=True,
                 post_norm=False,
                 activation='quick_gelu',
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 embedding_dropout=0.0,
                 norm_eps=1e-5):
        if image_size % patch_size != 0:
            print(
                '[WARNING] image_size is not divisible by patch_size',
                flush=True)
        assert pool_type in ('token', 'token_fc', 'attn_pool')
        out_dim = out_dim or dim
        super().__init__()
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size)**2
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.pool_type = pool_type
        self.post_norm = post_norm
        self.norm_eps = norm_eps

        # embeddings; patch conv has no bias when a pre-norm follows it
        gain = 1.0 / math.sqrt(dim)
        self.patch_embedding = nn.Conv2d(
            3,
            dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=not pre_norm)
        if pool_type in ('token', 'token_fc'):
            self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
        # one extra position slot for the cls token when it exists
        self.pos_embedding = nn.Parameter(gain * torch.randn(
            1, self.num_patches +
            (1 if pool_type in ('token', 'token_fc') else 0), dim))
        self.dropout = nn.Dropout(embedding_dropout)

        # transformer
        self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
        self.transformer = nn.Sequential(*[
            AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
                           activation, attn_dropout, proj_dropout, norm_eps)
            for _ in range(num_layers)
        ])
        # NOTE(review): this overwrites the boolean ``self.post_norm`` stored
        # above with a module; harmless here since forward() never reads it,
        # but confusing.
        self.post_norm = LayerNorm(dim, eps=norm_eps)

        # head (not applied in forward)
        if pool_type == 'token':
            self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
        elif pool_type == 'token_fc':
            self.head = nn.Linear(dim, out_dim)
        elif pool_type == 'attn_pool':
            self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
                                      proj_dropout, norm_eps)

    def forward(self, x, interpolation=False, use_31_block=False):
        """
        x: [B, 3, H, W].
        interpolation: resize the position table to the actual token count.
        use_31_block: stop one block before the end (penultimate features).

        Returns token features [B, L, dim].
        """
        b = x.size(0)

        # embeddings: patchify, flatten to tokens, prepend cls if configured
        x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
        if self.pool_type in ('token', 'token_fc'):
            x = torch.cat([self.cls_embedding.expand(b, -1, -1).to(dtype=x.dtype, device=x.device), x], dim=1)
        if interpolation:
            e = pos_interpolate(self.pos_embedding, x.size(1))
        else:
            e = self.pos_embedding
        e = e.to(dtype=x.dtype, device=x.device)
        x = self.dropout(x + e)
        if self.pre_norm is not None:
            x = self.pre_norm(x)

        # transformer
        if use_31_block:
            x = self.transformer[:-1](x)
            return x
        else:
            x = self.transformer(x)
            return x
479
+
480
+
481
class CLIP(nn.Module):
    """Standard CLIP: a ViT image tower plus a text tower trained with a
    learned logit scale (and optional logit bias).

    NOTE(review): ``TextTransformer`` is not defined in this module, so
    instantiating CLIP here raises NameError unless it is provided elsewhere;
    this repo appears to use ``XLMRobertaCLIP`` instead.
    """

    def __init__(self,
                 embed_dim=512,
                 image_size=224,
                 patch_size=16,
                 vision_dim=768,
                 vision_mlp_ratio=4,
                 vision_heads=12,
                 vision_layers=12,
                 vision_pool='token',
                 vision_pre_norm=True,
                 vision_post_norm=False,
                 vocab_size=49408,
                 text_len=77,
                 text_dim=512,
                 text_mlp_ratio=4,
                 text_heads=8,
                 text_layers=12,
                 text_causal=True,
                 text_pool='argmax',
                 text_head_bias=False,
                 logit_bias=None,
                 activation='quick_gelu',
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 embedding_dropout=0.0,
                 norm_eps=1e-5):
        super().__init__()
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.patch_size = patch_size
        self.vision_dim = vision_dim
        self.vision_mlp_ratio = vision_mlp_ratio
        self.vision_heads = vision_heads
        self.vision_layers = vision_layers
        self.vision_pool = vision_pool
        self.vision_pre_norm = vision_pre_norm
        self.vision_post_norm = vision_post_norm
        self.vocab_size = vocab_size
        self.text_len = text_len
        self.text_dim = text_dim
        self.text_mlp_ratio = text_mlp_ratio
        self.text_heads = text_heads
        self.text_layers = text_layers
        self.text_causal = text_causal
        self.text_pool = text_pool
        self.text_head_bias = text_head_bias
        self.norm_eps = norm_eps

        # models
        self.visual = VisionTransformer(
            image_size=image_size,
            patch_size=patch_size,
            dim=vision_dim,
            mlp_ratio=vision_mlp_ratio,
            out_dim=embed_dim,
            num_heads=vision_heads,
            num_layers=vision_layers,
            pool_type=vision_pool,
            pre_norm=vision_pre_norm,
            post_norm=vision_post_norm,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps)
        self.textual = TextTransformer(
            vocab_size=vocab_size,
            text_len=text_len,
            dim=text_dim,
            mlp_ratio=text_mlp_ratio,
            out_dim=embed_dim,
            num_heads=text_heads,
            num_layers=text_layers,
            causal=text_causal,
            pool_type=text_pool,
            head_bias=text_head_bias,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps)
        # temperature initialized to 1/0.07 as in the CLIP paper
        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
        if logit_bias is not None:
            self.logit_bias = nn.Parameter(logit_bias * torch.ones([]))

        # initialize weights
        self.init_weights()

    def forward(self, imgs, txt_ids):
        """
        imgs: [B, 3, H, W] of torch.float32.
        - mean: [0.48145466, 0.4578275, 0.40821073]
        - std: [0.26862954, 0.26130258, 0.27577711]
        txt_ids: [B, L] of torch.long. Encoded by data.CLIPTokenizer.

        Returns (image_features, text_features).
        """
        xi = self.visual(imgs)
        xt = self.textual(txt_ids)
        return xi, xt

    def init_weights(self):
        """Re-initialize embeddings and transformer weights with scaled
        normal distributions (depth-scaled for residual projections)."""
        # embeddings
        nn.init.normal_(self.textual.token_embedding.weight, std=0.02)
        nn.init.normal_(self.visual.patch_embedding.weight, std=0.1)

        # attentions
        for modality in ['visual', 'textual']:
            dim = self.vision_dim if modality == 'visual' else self.text_dim
            transformer = getattr(self, modality).transformer
            proj_gain = (1.0 / math.sqrt(dim)) * (
                1.0 / math.sqrt(2 * len(transformer)))
            attn_gain = 1.0 / math.sqrt(dim)
            mlp_gain = 1.0 / math.sqrt(2.0 * dim)
            for block in transformer:
                nn.init.normal_(block.attn.to_qkv.weight, std=attn_gain)
                nn.init.normal_(block.attn.proj.weight, std=proj_gain)
                nn.init.normal_(block.mlp[0].weight, std=mlp_gain)
                nn.init.normal_(block.mlp[2].weight, std=proj_gain)

    def param_groups(self):
        """Two optimizer groups: norms/biases without weight decay, the rest
        with the optimizer's default decay."""
        groups = [{
            'params': [
                p for n, p in self.named_parameters()
                if 'norm' in n or n.endswith('bias')
            ],
            'weight_decay': 0.0
        }, {
            'params': [
                p for n, p in self.named_parameters()
                if not ('norm' in n or n.endswith('bias'))
            ]
        }]
        return groups
615
+
616
+
617
class XLMRobertaWithHead(XLMRoberta):
    """XLM-RoBERTa encoder plus a two-layer projection head applied to the
    mask-aware mean-pooled sequence representation."""

    def __init__(self, **kwargs):
        self.out_dim = kwargs.pop('out_dim')
        super().__init__(**kwargs)

        # projection head: dim -> (dim + out_dim) // 2 -> out_dim, no biases
        mid_dim = (self.dim + self.out_dim) // 2
        self.head = nn.Sequential(
            nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(),
            nn.Linear(mid_dim, self.out_dim, bias=False))

    def forward(self, ids):
        # encode tokens with the base model
        hidden = super().forward(ids)

        # mean-pool over non-padding positions only
        pad_mask = ids.ne(self.pad_id).unsqueeze(-1).to(hidden)
        pooled = (hidden * pad_mask).sum(dim=1) / pad_mask.sum(dim=1)

        # project into the shared embedding space
        return self.head(pooled)
640
+
641
+
642
class XLMRobertaCLIP(nn.Module):
    """open-clip style CLIP with a ViT visual tower and (nominally) an
    XLM-RoBERTa text tower.

    NOTE(review): ``self.textual`` is set to None here, so ``forward`` would
    raise when it calls ``self.textual(txt_ids)``; in this repo only the
    ``visual`` tower is used (see WanImageEncoder).
    """

    def __init__(self,
                 embed_dim=1024,
                 image_size=224,
                 patch_size=14,
                 vision_dim=1280,
                 vision_mlp_ratio=4,
                 vision_heads=16,
                 vision_layers=32,
                 vision_pool='token',
                 vision_pre_norm=True,
                 vision_post_norm=False,
                 activation='gelu',
                 vocab_size=250002,
                 max_text_len=514,
                 type_size=1,
                 pad_id=1,
                 text_dim=1024,
                 text_heads=16,
                 text_layers=24,
                 text_post_norm=True,
                 text_dropout=0.1,
                 attn_dropout=0.0,
                 proj_dropout=0.0,
                 embedding_dropout=0.0,
                 norm_eps=1e-5):
        super().__init__()
        self.embed_dim = embed_dim
        self.image_size = image_size
        self.patch_size = patch_size
        self.vision_dim = vision_dim
        self.vision_mlp_ratio = vision_mlp_ratio
        self.vision_heads = vision_heads
        self.vision_layers = vision_layers
        self.vision_pre_norm = vision_pre_norm
        self.vision_post_norm = vision_post_norm
        self.activation = activation
        self.vocab_size = vocab_size
        self.max_text_len = max_text_len
        self.type_size = type_size
        self.pad_id = pad_id
        self.text_dim = text_dim
        self.text_heads = text_heads
        self.text_layers = text_layers
        self.text_post_norm = text_post_norm
        self.norm_eps = norm_eps

        # models
        self.visual = VisionTransformer(
            image_size=image_size,
            patch_size=patch_size,
            dim=vision_dim,
            mlp_ratio=vision_mlp_ratio,
            out_dim=embed_dim,
            num_heads=vision_heads,
            num_layers=vision_layers,
            pool_type=vision_pool,
            pre_norm=vision_pre_norm,
            post_norm=vision_post_norm,
            activation=activation,
            attn_dropout=attn_dropout,
            proj_dropout=proj_dropout,
            embedding_dropout=embedding_dropout,
            norm_eps=norm_eps)
        # text tower deliberately omitted; only the visual tower is needed
        self.textual = None
        self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))

    def forward(self, imgs, txt_ids):
        """
        imgs: [B, 3, H, W] of torch.float32.
        - mean: [0.48145466, 0.4578275, 0.40821073]
        - std: [0.26862954, 0.26130258, 0.27577711]
        txt_ids: [B, L] of torch.long.
        Encoded by data.CLIPTokenizer.
        """
        xi = self.visual(imgs)
        xt = self.textual(txt_ids)
        return xi, xt

    def param_groups(self):
        """Two optimizer groups: norms/biases without weight decay, the rest
        with the default decay."""
        groups = [{
            'params': [
                p for n, p in self.named_parameters()
                if 'norm' in n or n.endswith('bias')
            ],
            'weight_decay': 0.0
        }, {
            'params': [
                p for n, p in self.named_parameters()
                if not ('norm' in n or n.endswith('bias'))
            ]
        }]
        return groups
736
+
737
+
738
def _clip(pretrained=False,
          pretrained_name=None,
          model_cls=CLIP,
          return_transforms=False,
          return_tokenizer=False,
          tokenizer_padding='eos',
          dtype=torch.float32,
          device='cpu',
          **kwargs):
    """Factory for CLIP-family models.

    Builds ``model_cls(**kwargs)``, optionally loading pretrained weights,
    and optionally appends preprocessing transforms and/or a tokenizer to the
    returned tuple. Returns the bare model when nothing else was requested.
    """
    # init model
    if pretrained and pretrained_name:
        from sora import BUCKET, DOWNLOAD_TO_CACHE

        # init a meta model (no memory until load_state_dict(assign=True))
        with torch.device('meta'):
            model = model_cls(**kwargs)

        # checkpoint path; prefer an fp16/bf16 variant when available
        checkpoint = f'models/clip/{pretrained_name}'
        if dtype in (torch.float16, torch.bfloat16):
            suffix = '-' + {
                torch.float16: 'fp16',
                torch.bfloat16: 'bf16'
            }[dtype]
            # NOTE(review): ``object_exists`` is not defined in this module —
            # this branch raises NameError unless it is provided elsewhere.
            if object_exists(BUCKET, f'{checkpoint}{suffix}.pth'):
                checkpoint = f'{checkpoint}{suffix}'
        checkpoint += '.pth'

        # load
        model.load_state_dict(
            torch.load(DOWNLOAD_TO_CACHE(checkpoint), map_location=device),
            assign=True,
            strict=False)
    else:
        # init a model on device
        with torch.device(device):
            model = model_cls(**kwargs)

    # set device
    output = (model,)

    # init transforms
    if return_transforms:
        # mean and std
        # NOTE(review): assumes pretrained_name is a string here; passing
        # return_transforms=True with pretrained_name=None would raise.
        if 'siglip' in pretrained_name.lower():
            mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
        else:
            mean = [0.48145466, 0.4578275, 0.40821073]
            std = [0.26862954, 0.26130258, 0.27577711]

        # transforms
        transforms = T.Compose([
            T.Resize((model.image_size, model.image_size),
                     interpolation=T.InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std)
        ])
        output += (transforms,)

    # init tokenizer, picked by the pretrained model family
    if return_tokenizer:
        from sora import data
        if 'siglip' in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name=f'timm/{pretrained_name}',
                seq_len=model.text_len,
                clean='canonicalize')
        elif 'xlm' in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name='xlm-roberta-large',
                seq_len=model.max_text_len - 2,
                clean='whitespace')
        elif 'mba' in pretrained_name.lower():
            tokenizer = data.HuggingfaceTokenizer(
                name='facebook/xlm-roberta-xl',
                seq_len=model.max_text_len - 2,
                clean='whitespace')
        else:
            tokenizer = data.CLIPTokenizer(
                seq_len=model.text_len, padding=tokenizer_padding)
        output += (tokenizer,)
    return output[0] if len(output) == 1 else output
820
+
821
+
822
+ def clip_xlm_roberta_vit_h_14(
823
+ pretrained=False,
824
+ pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
825
+ **kwargs):
826
+ cfg = dict(
827
+ embed_dim=1024,
828
+ image_size=224,
829
+ patch_size=14,
830
+ vision_dim=1280,
831
+ vision_mlp_ratio=4,
832
+ vision_heads=16,
833
+ vision_layers=32,
834
+ vision_pool='token',
835
+ activation='gelu',
836
+ vocab_size=250002,
837
+ max_text_len=514,
838
+ type_size=1,
839
+ pad_id=1,
840
+ text_dim=1024,
841
+ text_heads=16,
842
+ text_layers=24,
843
+ text_post_norm=True,
844
+ text_dropout=0.1,
845
+ attn_dropout=0.0,
846
+ proj_dropout=0.0,
847
+ embedding_dropout=0.0)
848
+ cfg.update(**kwargs)
849
+ return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
850
+
851
+
852
class WanImageEncoder(torch.nn.Module):
    """Image encoder for Wan video models: the visual tower of the open-clip
    XLM-RoBERTa/ViT-H-14 CLIP, returning penultimate-block token features."""

    def __init__(self):
        super().__init__()
        # init model; weights are expected to be loaded later via the
        # state-dict converter below
        self.model, self.transforms = clip_xlm_roberta_vit_h_14(
            pretrained=False,
            return_transforms=True,
            return_tokenizer=False,
            dtype=torch.float32,
            device="cpu")

    def encode_image(self, videos):
        """
        videos: iterable of 4-D tensors — bicubic F.interpolate requires
        [N, C, H, W]; values presumably in [-1, 1] given the rescale below
        (TODO confirm against callers).
        """
        # preprocess: resize every clip to the CLIP input resolution
        size = (self.model.image_size,) * 2
        videos = torch.cat([
            F.interpolate(
                u,
                size=size,
                mode='bicubic',
                align_corners=False) for u in videos
        ])
        # map [-1, 1] -> [0, 1] (mul_/add_ mutate the resized tensor in
        # place), then apply the final transform, which is the Normalize
        videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))

        # forward through the visual tower in its parameter dtype, stopping
        # one block before the end (use_31_block)
        dtype = next(iter(self.model.visual.parameters())).dtype
        videos = videos.to(dtype)
        out = self.model.visual(videos, use_31_block=True)
        return out

    @staticmethod
    def state_dict_converter():
        # converter that adapts external checkpoints to this module's keys
        return WanImageEncoderStateDictConverter()
885
+
886
+
887
class WanImageEncoderStateDictConverter:
    """Key renamer for WanImageEncoder checkpoints: drops the text tower and
    nests everything else under the ``model.`` submodule."""

    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        # diffusers checkpoints already use matching keys
        return state_dict

    def from_civitai(self, state_dict):
        # skip "textual.*" weights (the text tower is never instantiated)
        # and prefix the remaining keys with "model."
        return {
            "model." + key: value
            for key, value in state_dict.items()
            if not key.startswith("textual.")
        }
902
+
diffsynth/models/wan_video_motion_controller.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from .wan_video_dit import sinusoidal_embedding_1d
4
+
5
+
6
+
7
class WanMotionControllerModel(torch.nn.Module):
    """Maps a motion bucket id to a ``dim * 6`` modulation embedding via a
    sinusoidal encoding followed by a three-layer SiLU MLP."""

    def __init__(self, freq_dim=256, dim=1536):
        super().__init__()
        self.freq_dim = freq_dim
        self.linear = nn.Sequential(
            nn.Linear(freq_dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
            nn.SiLU(),
            nn.Linear(dim, dim * 6),
        )

    def forward(self, motion_bucket_id):
        # scale the bucket id before the sinusoidal embedding
        sin_emb = sinusoidal_embedding_1d(self.freq_dim, motion_bucket_id * 10)
        return self.linear(sin_emb)

    def init(self):
        # zero out the final layer so the controller starts as a no-op
        last = self.linear[-1]
        zeroed = {key: value * 0 for key, value in last.state_dict().items()}
        last.load_state_dict(zeroed)

    @staticmethod
    def state_dict_converter():
        return WanMotionControllerModelDictConverter()
32
+
33
+
34
+
35
class WanMotionControllerModelDictConverter:
    """Identity converter: motion-controller checkpoints need no renaming."""

    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        # keys already match the module layout
        return state_dict

    def from_civitai(self, state_dict):
        # keys already match the module layout
        return state_dict
44
+
diffsynth/models/wan_video_text_encoder.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ def fp16_clamp(x):
9
+ if x.dtype == torch.float16 and torch.isinf(x).any():
10
+ clamp = torch.finfo(x.dtype).max - 1000
11
+ x = torch.clamp(x, min=-clamp, max=clamp)
12
+ return x
13
+
14
+
15
+ class GELU(nn.Module):
16
+
17
+ def forward(self, x):
18
+ return 0.5 * x * (1.0 + torch.tanh(
19
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
20
+
21
+
22
+ class T5LayerNorm(nn.Module):
23
+
24
+ def __init__(self, dim, eps=1e-6):
25
+ super(T5LayerNorm, self).__init__()
26
+ self.dim = dim
27
+ self.eps = eps
28
+ self.weight = nn.Parameter(torch.ones(dim))
29
+
30
+ def forward(self, x):
31
+ x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
32
+ self.eps)
33
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
34
+ x = x.type_as(self.weight)
35
+ return self.weight * x
36
+
37
+
38
+ class T5Attention(nn.Module):
39
+
40
+ def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
41
+ assert dim_attn % num_heads == 0
42
+ super(T5Attention, self).__init__()
43
+ self.dim = dim
44
+ self.dim_attn = dim_attn
45
+ self.num_heads = num_heads
46
+ self.head_dim = dim_attn // num_heads
47
+
48
+ # layers
49
+ self.q = nn.Linear(dim, dim_attn, bias=False)
50
+ self.k = nn.Linear(dim, dim_attn, bias=False)
51
+ self.v = nn.Linear(dim, dim_attn, bias=False)
52
+ self.o = nn.Linear(dim_attn, dim, bias=False)
53
+ self.dropout = nn.Dropout(dropout)
54
+
55
+ def forward(self, x, context=None, mask=None, pos_bias=None):
56
+ """
57
+ x: [B, L1, C].
58
+ context: [B, L2, C] or None.
59
+ mask: [B, L2] or [B, L1, L2] or None.
60
+ """
61
+ # check inputs
62
+ context = x if context is None else context
63
+ b, n, c = x.size(0), self.num_heads, self.head_dim
64
+
65
+ # compute query, key, value
66
+ q = self.q(x).view(b, -1, n, c)
67
+ k = self.k(context).view(b, -1, n, c)
68
+ v = self.v(context).view(b, -1, n, c)
69
+
70
+ # attention bias
71
+ attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
72
+ if pos_bias is not None:
73
+ attn_bias += pos_bias
74
+ if mask is not None:
75
+ assert mask.ndim in [2, 3]
76
+ mask = mask.view(b, 1, 1,
77
+ -1) if mask.ndim == 2 else mask.unsqueeze(1)
78
+ attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
79
+
80
+ # compute attention (T5 does not use scaling)
81
+ attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
82
+ attn = F.softmax(attn.float(), dim=-1).type_as(attn)
83
+ x = torch.einsum('bnij,bjnc->binc', attn, v)
84
+
85
+ # output
86
+ x = x.reshape(b, -1, n * c)
87
+ x = self.o(x)
88
+ x = self.dropout(x)
89
+ return x
90
+
91
+
92
+ class T5FeedForward(nn.Module):
93
+
94
+ def __init__(self, dim, dim_ffn, dropout=0.1):
95
+ super(T5FeedForward, self).__init__()
96
+ self.dim = dim
97
+ self.dim_ffn = dim_ffn
98
+
99
+ # layers
100
+ self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
101
+ self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
102
+ self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
103
+ self.dropout = nn.Dropout(dropout)
104
+
105
+ def forward(self, x):
106
+ x = self.fc1(x) * self.gate(x)
107
+ x = self.dropout(x)
108
+ x = self.fc2(x)
109
+ x = self.dropout(x)
110
+ return x
111
+
112
+
113
class T5SelfAttention(nn.Module):
    """One T5 encoder layer: pre-norm self-attention then a pre-norm gated
    feed-forward, each wrapped in a residual connection.

    When ``shared_pos`` is True the relative-position bias is supplied by
    the caller (a single table shared across all layers); otherwise the
    layer owns its own T5RelativeEmbedding.
    """

    def __init__(self,
                 dim,
                 dim_attn,
                 dim_ffn,
                 num_heads,
                 num_buckets,
                 shared_pos=True,
                 dropout=0.1):
        super(T5SelfAttention, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers
        self.norm1 = T5LayerNorm(dim)
        self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
        self.norm2 = T5LayerNorm(dim)
        self.ffn = T5FeedForward(dim, dim_ffn, dropout)
        self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True)

    def forward(self, x, mask=None, pos_bias=None):
        # Use the caller-supplied bias when shared, else compute per layer.
        e = pos_bias if self.shared_pos else self.pos_embedding(
            x.size(1), x.size(1))
        # fp16_clamp keeps the residual sums inside the fp16 value range.
        x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
        x = fp16_clamp(x + self.ffn(self.norm2(x)))
        return x
145
+
146
+
147
class T5RelativeEmbedding(nn.Module):
    """T5-style bucketed relative position bias.

    Each (query, key) offset maps to one of ``num_buckets`` learned
    per-head scalars: one bucket per offset for small distances, then
    logarithmically spaced buckets up to ``max_dist``.
    """

    def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
        super(T5RelativeEmbedding, self).__init__()
        self.num_buckets = num_buckets
        self.num_heads = num_heads
        self.bidirectional = bidirectional
        self.max_dist = max_dist

        # layers
        self.embedding = nn.Embedding(num_buckets, num_heads)

    def forward(self, lq, lk):
        device = self.embedding.weight.device
        # rel_pos[i, j] = j - i, built directly on the embedding's device
        # to avoid a host/device round-trip.
        rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
            torch.arange(lq, device=device).unsqueeze(1)
        rel_pos = self._relative_position_bucket(rel_pos)
        rel_pos_embeds = self.embedding(rel_pos)
        # [Lq, Lk, N] -> [1, N, Lq, Lk] so it adds onto attention logits.
        rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
            0)  # [1, N, Lq, Lk]
        return rel_pos_embeds.contiguous()

    def _relative_position_bucket(self, rel_pos):
        # preprocess: bidirectional models split the buckets between the
        # two directions; unidirectional ones bucket only past offsets.
        if self.bidirectional:
            num_buckets = self.num_buckets // 2
            rel_buckets = (rel_pos > 0).long() * num_buckets
            rel_pos = torch.abs(rel_pos)
        else:
            num_buckets = self.num_buckets
            rel_buckets = 0
            rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))

        # offsets below max_exact get one bucket each; larger offsets are
        # spread logarithmically over the remaining buckets, capped at the
        # last bucket for anything beyond max_dist.
        max_exact = num_buckets // 2
        rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
                                     math.log(self.max_dist / max_exact) *
                                     (num_buckets - max_exact)).long()
        rel_pos_large = torch.min(
            rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
        rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
        return rel_buckets
191
+
192
def init_weights(m):
    # T5-style initialization: each projection is scaled by the inverse
    # square root of its fan-in so activations keep unit variance at depth.
    if isinstance(m, T5LayerNorm):
        nn.init.ones_(m.weight)
    elif isinstance(m, T5FeedForward):
        nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
        nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
    elif isinstance(m, T5Attention):
        nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
        nn.init.normal_(m.k.weight, std=m.dim**-0.5)
        nn.init.normal_(m.v.weight, std=m.dim**-0.5)
        nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
    elif isinstance(m, T5RelativeEmbedding):
        nn.init.normal_(
            m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)
207
+
208
+
209
class WanTextEncoder(torch.nn.Module):
    """T5-style encoder-only transformer used as the Wan text encoder.

    A token embedding followed by ``num_layers`` T5SelfAttention blocks
    and a final layer norm. With ``shared_pos`` True a single relative
    position table is shared across all layers; otherwise each block owns
    its own (the default here).
    """

    def __init__(self,
                 vocab=256384,
                 dim=4096,
                 dim_attn=4096,
                 dim_ffn=10240,
                 num_heads=64,
                 num_layers=24,
                 num_buckets=32,
                 shared_pos=False,
                 dropout=0.1):
        super(WanTextEncoder, self).__init__()
        self.dim = dim
        self.dim_attn = dim_attn
        self.dim_ffn = dim_ffn
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.num_buckets = num_buckets
        self.shared_pos = shared_pos

        # layers; ``vocab`` may be a prebuilt nn.Embedding to share weights
        self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
            else nn.Embedding(vocab, dim)
        self.pos_embedding = T5RelativeEmbedding(
            num_buckets, num_heads, bidirectional=True) if shared_pos else None
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([
            T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
                            shared_pos, dropout) for _ in range(num_layers)
        ])
        self.norm = T5LayerNorm(dim)

        # initialize weights
        self.apply(init_weights)

    def forward(self, ids, mask=None):
        """Encode token ids [B, L] (optional padding ``mask``) into
        hidden states [B, L, dim]."""
        x = self.token_embedding(ids)
        x = self.dropout(x)
        e = self.pos_embedding(x.size(1),
                               x.size(1)) if self.shared_pos else None
        for block in self.blocks:
            x = block(x, mask, pos_bias=e)
        x = self.norm(x)
        x = self.dropout(x)
        return x

    @staticmethod
    def state_dict_converter():
        return WanTextEncoderStateDictConverter()
259
+
260
+
261
class WanTextEncoderStateDictConverter:
    """Maps checkpoint state dicts onto WanTextEncoder parameter names.

    Both supported checkpoint layouts already match the module's naming
    scheme, so the conversions are identity passthroughs.
    """

    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        # Diffusers-format checkpoints need no renaming.
        return state_dict

    def from_civitai(self, state_dict):
        # Civitai-format checkpoints need no renaming.
        return state_dict
diffsynth/models/wan_video_vace.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .wan_video_dit import DiTBlock
3
+ from .utils import hash_state_dict_keys
4
+
5
class VaceWanAttentionBlock(DiTBlock):
    """A DiT block extended for VACE conditioning.

    Block 0 projects the raw conditioning stream into the DiT hidden
    space and adds the main hidden states ``x``; every block appends an
    ``after_proj`` skip tensor that is later injected back into the
    matching DiT layer.
    """

    def __init__(self, has_image_input, dim, num_heads, ffn_dim, eps=1e-6, block_id=0):
        super().__init__(has_image_input, dim, num_heads, ffn_dim, eps=eps)
        self.block_id = block_id
        if block_id == 0:
            # only the first block maps the VACE embedding into x-space
            self.before_proj = torch.nn.Linear(self.dim, self.dim)
        self.after_proj = torch.nn.Linear(self.dim, self.dim)

    def forward(self, c, x, context, t_mod, freqs):
        # ``c`` is a stack [skip_0, ..., skip_{k-1}, current]: unpack the
        # running stream, run the DiT block on it, then re-stack with this
        # block's skip projection appended before the new running stream.
        if self.block_id == 0:
            c = self.before_proj(c) + x
            all_c = []
        else:
            all_c = list(torch.unbind(c))
            c = all_c.pop(-1)
        c = super().forward(c, context, t_mod, freqs)
        c_skip = self.after_proj(c)
        all_c += [c_skip, c]
        c = torch.stack(all_c)
        return c
25
+
26
+
27
class VaceWanModel(torch.nn.Module):
    """VACE conditioning branch for the Wan DiT.

    Patch-embeds the VACE context video and runs it through a stack of
    VaceWanAttentionBlock layers, returning one hint tensor per entry of
    ``vace_layers`` for injection into the matching DiT blocks.
    """

    def __init__(
        self,
        vace_layers=(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28),
        vace_in_dim=96,
        patch_size=(1, 2, 2),
        has_image_input=False,
        dim=1536,
        num_heads=12,
        ffn_dim=8960,
        eps=1e-6,
    ):
        super().__init__()
        self.vace_layers = vace_layers
        self.vace_in_dim = vace_in_dim
        # maps a DiT layer index -> position of its VACE hint
        self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)}

        # vace blocks; block_id is the DiT layer index, so only the block
        # for layer 0 builds the ``before_proj`` input projection
        self.vace_blocks = torch.nn.ModuleList([
            VaceWanAttentionBlock(has_image_input, dim, num_heads, ffn_dim, eps, block_id=i)
            for i in self.vace_layers
        ])

        # vace patch embeddings
        self.vace_patch_embedding = torch.nn.Conv3d(vace_in_dim, dim, kernel_size=patch_size, stride=patch_size)

    def forward(
        self, x, vace_context, context, t_mod, freqs,
        use_gradient_checkpointing: bool = False,
        use_gradient_checkpointing_offload: bool = False,
    ):
        """Return per-layer hint tensors for the DiT, one per vace layer."""
        # Patch-embed each context video, flatten to token sequences, and
        # zero-pad every sequence up to the token length of ``x``.
        c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
        c = [u.flatten(2).transpose(1, 2) for u in c]
        c = torch.cat([
            torch.cat([u, u.new_zeros(1, x.shape[1] - u.size(1), u.size(2))],
                      dim=1) for u in c
        ])

        def create_custom_forward(module):
            # wrapper required by torch.utils.checkpoint
            def custom_forward(*inputs):
                return module(*inputs)
            return custom_forward

        for block in self.vace_blocks:
            if use_gradient_checkpointing_offload:
                # additionally offload checkpointed activations to CPU RAM
                with torch.autograd.graph.save_on_cpu():
                    c = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        c, x, context, t_mod, freqs,
                        use_reentrant=False,
                    )
            elif use_gradient_checkpointing:
                c = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    c, x, context, t_mod, freqs,
                    use_reentrant=False,
                )
            else:
                c = block(c, x, context, t_mod, freqs)
        # the final stack is [skip_0, ..., skip_{k-1}, running_stream];
        # only the skip tensors are returned as hints
        hints = torch.unbind(c)[:-1]
        return hints

    @staticmethod
    def state_dict_converter():
        return VaceWanModelDictConverter()
92
+
93
+
94
class VaceWanModelDictConverter:
    """Extracts VACE-specific weights from a checkpoint and infers config."""

    def __init__(self):
        pass

    def from_civitai(self, state_dict):
        # Keep only the tensors belonging to the VACE branch.
        vace_state_dict = {
            key: value
            for key, value in state_dict.items() if key.startswith("vace")
        }
        # Recognize known checkpoints by a hash of their key set.
        if hash_state_dict_keys(vace_state_dict) == '3b2726384e4f64837bdf216eea3f310d':
            # VACE 14B layout
            config = {
                "vace_layers": (0, 5, 10, 15, 20, 25, 30, 35),
                "vace_in_dim": 96,
                "patch_size": (1, 2, 2),
                "has_image_input": False,
                "dim": 5120,
                "num_heads": 40,
                "ffn_dim": 13824,
                "eps": 1e-06,
            }
        else:
            # Unknown checkpoint: fall back to the model's defaults.
            config = {}
        return vace_state_dict, config
diffsynth/models/wan_video_vae.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange, repeat
5
+ from tqdm import tqdm
6
+
7
+ CACHE_T = 2
8
+
9
+
10
def check_is_instance(model, module_class):
    """isinstance() that also looks through a single ``.module`` wrapper
    attribute (e.g. DataParallel-style wrappers)."""
    unwrapped = model.module if hasattr(model, "module") else model
    return isinstance(model, module_class) or isinstance(unwrapped, module_class)
16
+
17
+
18
def block_causal_mask(x, block_size):
    """Build a block-causal attention mask for ``x``.

    Position i may attend to position j iff j's block index does not
    exceed i's, i.e. attention is causal at block granularity.

    Args:
        x: tensor of shape [B, N, S, *]; only its leading sizes and device
            are used.
        block_size: block length; S must be divisible by it.

    Returns:
        Bool tensor of shape [B, N, S, S], True where attention is allowed.
    """
    # params
    b, n, s = x.size(0), x.size(1), x.size(2)
    device = x.device
    assert s % block_size == 0

    # Vectorized form of the original per-block loop: compare the block
    # index of each (query, key) pair directly instead of filling slices.
    block_idx = torch.arange(s, device=device) // block_size
    mask = block_idx.unsqueeze(1) >= block_idx.unsqueeze(0)  # [S, S]
    # materialize the broadcast so callers may mutate the result safely
    return mask.expand(b, n, s, s).contiguous()
30
+
31
+
32
class CausalConv3d(nn.Conv3d):
    """3-D convolution that is causal along the temporal axis.

    Spatial padding stays symmetric while all temporal padding is applied
    on the left (past) side, so an output frame never sees future frames.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # F.pad order: (w_left, w_right, h_left, h_right, t_left, t_right)
        self._padding = (self.padding[2], self.padding[2], self.padding[1],
                         self.padding[1], 2 * self.padding[0], 0)
        # disable the built-in padding; forward() pads manually instead
        self.padding = (0, 0, 0)

    def forward(self, x, cache_x=None):
        pad = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            # prepend cached frames from the previous chunk in place of
            # (part of) the zero padding
            x = torch.cat([cache_x.to(x.device), x], dim=2)
            pad[4] -= cache_x.shape[2]
        return super().forward(F.pad(x, pad))
52
+
53
+
54
class RMS_norm(nn.Module):
    """RMS normalization with a learned per-channel gain (optional bias).

    ``channel_first`` selects whether the channel axis is dim 1 (conv
    layouts) or the last dim; ``images`` controls how many trailing
    singleton dims the parameters broadcast over (two for image tensors,
    three for video tensors).
    """

    def __init__(self, dim, channel_first=True, images=True, bias=False):
        super().__init__()
        trailing = (1, 1) if images else (1, 1, 1)
        shape = (dim, *trailing) if channel_first else (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))
        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.

    def forward(self, x):
        norm_dim = 1 if self.channel_first else -1
        # F.normalize divides by the L2 norm; multiplying by sqrt(dim)
        # turns that into RMS normalization.
        normalized = F.normalize(x, dim=norm_dim) * self.scale
        return normalized * self.gamma + self.bias
70
+
71
+
72
class Upsample(nn.Upsample):

    def forward(self, x):
        """Upsample in float32, then cast back to the input dtype.

        Works around missing bfloat16 support for nearest-neighbor
        interpolation on some backends.
        """
        upsampled = super().forward(x.float())
        return upsampled.type_as(x)
79
+
80
+
81
class Resample(nn.Module):
    """Spatial (and optionally temporal) up/down-sampling stage.

    Modes:
      - 'upsample2d'/'downsample2d': 2x spatial resample only.
      - 'upsample3d'/'downsample3d': additionally 2x temporal resample via
        a causal ``time_conv`` with streaming cache support.
      - 'none': identity.
    """

    def __init__(self, dim, mode):
        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
                        'downsample3d')
        super().__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == 'upsample2d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
        elif mode == 'upsample3d':
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
                nn.Conv2d(dim, dim // 2, 3, padding=1))
            # doubles the channel count; the two halves are interleaved in
            # time in forward() to produce 2x temporal upsampling
            self.time_conv = CausalConv3d(dim,
                                          dim * 2, (3, 1, 1),
                                          padding=(1, 0, 0))

        elif mode == 'downsample2d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == 'downsample3d':
            self.resample = nn.Sequential(
                nn.ZeroPad2d((0, 1, 0, 1)),
                nn.Conv2d(dim, dim, 3, stride=(2, 2)))
            # temporal stride 2 halves the frame count
            self.time_conv = CausalConv3d(dim,
                                          dim, (3, 1, 1),
                                          stride=(2, 1, 1),
                                          padding=(0, 0, 0))

        else:
            self.resample = nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # feat_cache/feat_idx implement chunked streaming: feat_cache[idx]
        # holds the trailing frames of the previous chunk for this layer's
        # time_conv; the sentinel string 'Rep' marks the very first chunk.
        b, c, t, h, w = x.size()
        if self.mode == 'upsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: mark the slot and skip temporal caching
                    feat_cache[idx] = 'Rep'
                    feat_idx[0] += 1
                else:

                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] != 'Rep':
                        # cache last frame of last two chunk
                        cache_x = torch.cat([
                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                                cache_x.device), cache_x
                        ],
                                            dim=2)
                    if cache_x.shape[2] < 2 and feat_cache[
                            idx] is not None and feat_cache[idx] == 'Rep':
                        # first chunk is too short: left-pad with zeros
                        cache_x = torch.cat([
                            torch.zeros_like(cache_x).to(cache_x.device),
                            cache_x
                        ],
                                            dim=2)
                    if feat_cache[idx] == 'Rep':
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

                    # interleave the two channel halves along time:
                    # [B, 2C, T, H, W] -> [B, C, 2T, H, W]
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
                                    3)
                    x = x.reshape(b, c, t * 2, h, w)
        t = x.shape[2]
        # spatial resampling runs per-frame in 2-D
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.resample(x)
        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)

        if self.mode == 'downsample3d':
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # first chunk: only record the cache, no temporal conv
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    x = self.time_conv(
                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
        return x

    def init_weight(self, conv):
        # initialize the temporal conv as an identity over the last frame
        conv_weight = conv.weight
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        conv_weight.data[:, :, 1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def init_weight2(self, conv):
        # initialize the doubling temporal conv so both output halves
        # reproduce the input (identity before training)
        conv_weight = conv.weight.data
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)
195
+
196
+
197
class ResidualBlock(nn.Module):
    """Pre-norm residual block of two causal 3-D convolutions, with a
    1x1x1 conv shortcut when the channel count changes."""

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False), nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1))
        self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
            if in_dim != out_dim else nn.Identity()

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # NOTE(review): the mutable default feat_idx=[0] is shared across
        # calls; callers are expected to always pass their own list when
        # caching is active — confirm no call site relies on the default.
        h = self.shortcut(x)
        for layer in self.residual:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                # keep the trailing CACHE_T frames so the next chunk's
                # causal conv sees them instead of zero padding
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x + h
232
+
233
+
234
class AttentionBlock(nn.Module):
    """
    Single-head self-attention over the spatial tokens of each frame.

    NOTE(review): the docstring originally said "causal", but the
    block-causal attn_mask is commented out below, so this is full
    (unmasked) per-frame attention — confirm that is intended.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

        # layers
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

        # zero out the last layer params so the block starts as identity
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        # x: [B, C, T, H, W]; attention runs independently per frame over
        # the H*W spatial tokens.
        identity = x
        b, c, t, h, w = x.size()
        x = rearrange(x, 'b c t h w -> (b t) c h w')
        x = self.norm(x)
        # compute query, key, value: each [(B T), 1, H*W, C]
        q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(
            0, 1, 3, 2).contiguous().chunk(3, dim=-1)

        # apply attention
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            # attn_mask=block_causal_mask(q, block_size=h * w)
        )
        x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)

        # output projection plus residual connection
        x = self.proj(x)
        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
        return x + identity
273
+
274
+
275
class Encoder3d(nn.Module):
    """Video encoder: an init causal conv, staged residual blocks with
    2-D/3-D downsampling, mid attention, and a head projecting to
    ``z_dim`` channels."""

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # dimensions per stage: dim * [1, *dim_mult]
        dims = [dim * u for u in [1] + dim_mult]
        scale = 1.0  # running spatial scale, used against attn_scales

        # init block
        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks
            for _ in range(num_res_blocks):
                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    downsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # downsample block (omitted after the final stage)
            if i != len(dim_mult) - 1:
                mode = 'downsample3d' if temperal_downsample[
                    i] else 'downsample2d'
                downsamples.append(Resample(out_dim, mode=mode))
                scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(out_dim, out_dim, dropout),
                                    AttentionBlock(out_dim),
                                    ResidualBlock(out_dim, out_dim, dropout))

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, z_dim, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # feat_cache/feat_idx enable chunked streaming: each causal conv
        # stores its trailing frames so consecutive chunks are seamless.
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        # downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
376
+
377
+
378
class Decoder3d(nn.Module):
    """Video decoder mirroring Encoder3d: init conv, mid attention, staged
    residual blocks with 2-D/3-D upsampling, and an RGB output head."""

    def __init__(self,
                 dim=128,
                 z_dim=4,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # dimensions per stage, reversed relative to the encoder
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        scale = 1.0 / 2**(len(dim_mult) - 2)

        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(ResidualBlock(dims[0], dims[0], dropout),
                                    AttentionBlock(dims[0]),
                                    ResidualBlock(dims[0], dims[0], dropout))

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            # residual (+attention) blocks; upsample2d/3d halves the
            # channel count, so later stages start with in_dim // 2
            if i == 1 or i == 2 or i == 3:
                in_dim = in_dim // 2
            for _ in range(num_res_blocks + 1):
                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
                if scale in attn_scales:
                    upsamples.append(AttentionBlock(out_dim))
                in_dim = out_dim

            # upsample block (omitted after the final stage)
            if i != len(dim_mult) - 1:
                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
                upsamples.append(Resample(out_dim, mode=mode))
                scale *= 2.0
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(),
                                  CausalConv3d(out_dim, 3, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # feat_cache/feat_idx enable chunked streaming decode (one latent
        # frame at a time); each causal conv caches its trailing frames.
        # conv1
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # cache last frame of last two chunk
                cache_x = torch.cat([
                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                        cache_x.device), cache_x
                ],
                                    dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        # middle
        for layer in self.middle:
            if check_is_instance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # head
        for layer in self.head:
            if check_is_instance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([
                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
                            cache_x.device), cache_x
                    ],
                                        dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        return x
481
+
482
+
483
def count_conv3d(model):
    """Number of CausalConv3d modules in ``model``; this sizes the frame
    cache used by the chunked encode/decode paths."""
    return sum(
        1 for module in model.modules()
        if check_is_instance(module, CausalConv3d))
489
+
490
+
491
class VideoVAE_(nn.Module):
    """Causal 3-D video VAE: Encoder3d/Decoder3d plus 1x1x1 latent convs,
    with frame caching so video can be encoded/decoded in temporal chunks
    (4 video frames per latent frame after the first)."""

    def __init__(self,
                 dim=96,
                 z_dim=16,
                 dim_mult=[1, 2, 4, 4],
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[False, True, True],
                 dropout=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        self.temperal_upsample = temperal_downsample[::-1]

        # modules; the encoder outputs 2*z_dim channels (mu, log_var)
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
                                 attn_scales, self.temperal_upsample, dropout)

    def forward(self, x):
        # NOTE(review): encode()/decode() require a ``scale`` argument and
        # encode() returns only mu, so this legacy path would raise if
        # called — confirm whether it is still needed.
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        x_recon = self.decode(z)
        return x_recon, mu, log_var

    def encode(self, x, scale):
        """Encode video x [B, 3, T, H, W] chunk-by-chunk and return the
        normalized latent mean; ``scale`` is (mean, 1/std) per channel."""
        self.clear_cache()
        # cache: first chunk is 1 frame, every later chunk is 4 frames
        t = x.shape[2]
        iter_ = 1 + (t - 1) // 4

        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(x[:, :, :1, :, :],
                                   feat_cache=self._enc_feat_map,
                                   feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
                                    feat_cache=self._enc_feat_map,
                                    feat_idx=self._enc_conv_idx)
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        # standardize the latent with the dataset statistics
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=mu.dtype, device=mu.device) for s in scale]
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=mu.dtype, device=mu.device)
            mu = (mu - scale[0]) * scale[1]
        return mu

    def decode(self, z, scale):
        """Decode latents z [B, C, T, H, W] one latent frame at a time;
        ``scale`` is the same (mean, 1/std) pair used by encode()."""
        self.clear_cache()
        # z: [b,c,t,h,w]; undo the latent standardization first
        if isinstance(scale[0], torch.Tensor):
            scale = [s.to(dtype=z.dtype, device=z.device) for s in scale]
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
                1, self.z_dim, 1, 1, 1)
        else:
            scale = scale.to(dtype=z.dtype, device=z.device)
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z)
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(x[:, :, i:i + 1, :, :],
                                   feat_cache=self._feat_map,
                                   feat_idx=self._conv_idx)
            else:
                out_ = self.decoder(x[:, :, i:i + 1, :, :],
                                    feat_cache=self._feat_map,
                                    feat_idx=self._conv_idx)
                out = torch.cat([out, out_], 2)  # may add tensor offload
        return out

    def reparameterize(self, mu, log_var):
        # standard VAE reparameterization trick
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample(self, imgs, deterministic=False):
        # NOTE(review): same issue as forward() — encode() now takes a
        # scale argument and returns a single tensor, so this unpacking
        # would fail if called; confirm this path is unused.
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)

    def clear_cache(self):
        # one cache slot per CausalConv3d in each sub-network
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
596
+
597
+
598
+ class WanVideoVAE(nn.Module):
599
+
600
    def __init__(self, z_dim=16):
        super().__init__()

        # Per-channel latent statistics; self.scale = (mean, 1/std) is the
        # normalization pair consumed by VideoVAE_.encode/decode.
        mean = [
            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
        ]
        std = [
            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
        ]
        self.mean = torch.tensor(mean)
        self.std = torch.tensor(std)
        self.scale = [self.mean, 1.0 / self.std]

        # init model: inner VAE kept frozen in eval mode
        self.model = VideoVAE_(z_dim=z_dim).eval().requires_grad_(False)
        # one latent pixel corresponds to an 8x8 video patch
        self.upsampling_factor = 8
618
+
619
+ def build_1d_mask(self, length, left_bound, right_bound, border_width):
620
+ x = torch.ones((length,))
621
+ if not left_bound:
622
+ x[:border_width] = (torch.arange(border_width) + 1) / border_width
623
+ if not right_bound:
624
+ x[-border_width:] = torch.flip(
625
+ (torch.arange(border_width) + 1) / border_width, dims=(0,))
626
+ return x
627
+
628
    def build_mask(self, data, is_bound, border_width):
        """Build a feathered 2-D blending mask matching ``data``'s H and W.

        is_bound: (top, bottom, left, right) flags; a True side touches
        the full-frame boundary and keeps weight 1 instead of ramping.
        border_width: (vertical, horizontal) ramp lengths in pixels.
        Returns a [1, 1, 1, H, W] tensor for broadcasting over the tile.
        """
        _, _, _, H, W = data.shape
        h = self.build_1d_mask(H, is_bound[0], is_bound[1], border_width[0])
        w = self.build_1d_mask(W, is_bound[2], is_bound[3], border_width[1])

        h = repeat(h, "H -> H W", H=H, W=W)
        w = repeat(w, "W -> H W", H=H, W=W)

        # elementwise min of the two 1-D ramps gives the 2-D feathered edge
        mask = torch.stack([h, w]).min(dim=0).values
        mask = rearrange(mask, "H W -> 1 1 1 H W")
        return mask
639
+
640
+ def tiled_decode(self, hidden_states, device, tile_size, tile_stride):
641
+ _, _, T, H, W = hidden_states.shape
642
+ size_h, size_w = tile_size
643
+ stride_h, stride_w = tile_stride
644
+
645
+ # Split tasks
646
+ tasks = []
647
+ for h in range(0, H, stride_h):
648
+ if (h-stride_h >= 0 and h-stride_h+size_h >= H):
649
+ continue
650
+ for w in range(0, W, stride_w):
651
+ if (w-stride_w >= 0 and w-stride_w+size_w >= W):
652
+ continue
653
+ h_, w_ = h + size_h, w + size_w
654
+ tasks.append((h, h_, w, w_))
655
+
656
+ data_device = "cpu"
657
+ computation_device = device
658
+
659
+ out_T = T * 4 - 3
660
+ weight = torch.zeros((1, 1, out_T, H * self.upsampling_factor, W *
661
+ self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
662
+ values = torch.zeros((1, 3, out_T, H * self.upsampling_factor, W *
663
+ self.upsampling_factor), dtype=hidden_states.dtype, device=data_device)
664
+ disable_flag = True
665
+ for h, h_, w, w_ in tqdm(tasks, desc="VAE decoding", disable=disable_flag):
666
+ hidden_states_batch = hidden_states[:, :, :, h:h_, w:w_].to(
667
+ computation_device)
668
+ hidden_states_batch = self.model.decode(
669
+ hidden_states_batch, self.scale).to(data_device)
670
+
671
+ mask = self.build_mask(
672
+ hidden_states_batch,
673
+ is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
674
+ border_width=((size_h - stride_h) * self.upsampling_factor,
675
+ (size_w - stride_w) * self.upsampling_factor)
676
+ ).to(dtype=hidden_states.dtype, device=data_device)
677
+
678
+ target_h = h * self.upsampling_factor
679
+ target_w = w * self.upsampling_factor
680
+ import pdb
681
+
682
+ # pdb.set_trace()
683
+ values[
684
+ :,
685
+ :,
686
+ :,
687
+ target_h:target_h + hidden_states_batch.shape[3],
688
+ target_w:target_w + hidden_states_batch.shape[4],
689
+ ] += hidden_states_batch * mask
690
+ weight[
691
+ :,
692
+ :,
693
+ :,
694
+ target_h: target_h + hidden_states_batch.shape[3],
695
+ target_w: target_w + hidden_states_batch.shape[4],
696
+ ] += mask
697
+ values = values / weight
698
+ values = values.clamp_(-1, 1)
699
+ return values
700
+
701
    def tiled_encode(self, video, device, tile_size, tile_stride):
        """Encode a video tile-by-tile to bound peak VRAM usage.

        Mirror image of tiled_decode: overlapping pixel-space tiles are
        encoded on `device`, feather-blended with build_mask in latent
        space, and accumulated on CPU; overlap weights are normalized at
        the end.

        Args:
            video: pixel tensor of shape (B, C, T, H, W);
                tile_size/tile_stride are in PIXEL units here.
            device: computation device for the VAE encoder.
            tile_size: (height, width) of each pixel tile.
            tile_stride: (height, width) step between consecutive tiles.

        Returns:
            Latent tensor of shape (1, 16, (T+3)//4, H//8, W//8).
        """
        _, _, T, H, W = video.shape
        size_h, size_w = tile_size
        stride_h, stride_w = tile_stride

        # Split tasks; skip tiles whose area the previous stride already covered.
        tasks = []
        for h in range(0, H, stride_h):
            if (h-stride_h >= 0 and h-stride_h+size_h >= H):
                continue
            for w in range(0, W, stride_w):
                if (w-stride_w >= 0 and w-stride_w+size_w >= W):
                    continue
                h_, w_ = h + size_h, w + size_w
                tasks.append((h, h_, w, w_))

        data_device = "cpu"
        computation_device = device

        # Causal temporal compression: T frames -> (T + 3) // 4 latents.
        out_T = (T + 3) // 4
        weight = torch.zeros((1, 1, out_T, H // self.upsampling_factor, W //
                             self.upsampling_factor), dtype=video.dtype, device=data_device)
        values = torch.zeros((1, 16, out_T, H // self.upsampling_factor, W //
                             self.upsampling_factor), dtype=video.dtype, device=data_device)
        disable_flag = True  # progress bar suppressed
        for h, h_, w, w_ in tqdm(tasks, desc="VAE encoding", disable=disable_flag):
            hidden_states_batch = video[:, :, :,
                                        h:h_, w:w_].to(computation_device)
            hidden_states_batch = self.model.encode(
                hidden_states_batch, self.scale).to(data_device)

            # Blend mask in latent resolution; border widths scaled down by 8.
            mask = self.build_mask(
                hidden_states_batch,
                is_bound=(h == 0, h_ >= H, w == 0, w_ >= W),
                border_width=((size_h - stride_h) // self.upsampling_factor,
                              (size_w - stride_w) // self.upsampling_factor)
            ).to(dtype=video.dtype, device=data_device)

            target_h = h // self.upsampling_factor
            target_w = w // self.upsampling_factor
            values[
                :,
                :,
                :,
                target_h:target_h + hidden_states_batch.shape[3],
                target_w:target_w + hidden_states_batch.shape[4],
            ] += hidden_states_batch * mask
            weight[
                :,
                :,
                :,
                target_h: target_h + hidden_states_batch.shape[3],
                target_w: target_w + hidden_states_batch.shape[4],
            ] += mask
        # Normalize the accumulated overlaps (no clamp: latents are unbounded).
        values = values / weight
        return values
757
+
758
+ def single_encode(self, video, device):
759
+ video = video.to(device)
760
+ x = self.model.encode(video, self.scale)
761
+ return x
762
+
763
+ def single_decode(self, hidden_state, device):
764
+ hidden_state = hidden_state.to(device)
765
+ video = self.model.decode(hidden_state, self.scale)
766
+ return video.clamp_(-1, 1)
767
+
768
+ def encode(self, videos, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
769
+
770
+ # videos = [video.to("cpu") for video in videos]
771
+ hidden_states = []
772
+ for video in videos:
773
+ video = video.unsqueeze(0)
774
+ if tiled:
775
+ tile_size = (tile_size[0] * 8, tile_size[1] * 8)
776
+ tile_stride = (tile_stride[0] * 8, tile_stride[1] * 8)
777
+ hidden_state = self.tiled_encode(
778
+ video, device, tile_size, tile_stride)
779
+ else:
780
+ hidden_state = self.single_encode(video, device)
781
+ hidden_state = hidden_state.squeeze(0)
782
+ hidden_states.append(hidden_state)
783
+ hidden_states = torch.stack(hidden_states)
784
+ # TODO
785
+ # if tiled:
786
+ # hidden_states = self.tiled_encode(
787
+ # videos, device, tile_size, tile_stride)
788
+ # else:
789
+ # hidden_states = self.single_encode(videos, device)
790
+ return hidden_states
791
+
792
+ def decode(self, hidden_states, device, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
793
+ video = []
794
+ for _hidden_states in hidden_states:
795
+ _hidden_states = _hidden_states.unsqueeze(0)
796
+ if tiled:
797
+ video.append(
798
+ self.tiled_decode(_hidden_states, device, tile_size,
799
+ tile_stride))
800
+ else:
801
+ video.append(self.single_decode(_hidden_states, device))
802
+
803
+ video = torch.cat(video, dim=0)
804
+ # TODO
805
+ # if tiled:
806
+ # video = self.tiled_decode(
807
+ # hidden_states, device, tile_size, tile_stride)
808
+ # else:
809
+ # video = self.single_decode(hidden_states, device)
810
+ return video
811
+
812
    @staticmethod
    def state_dict_converter():
        # Factory for the converter that remaps civitai-style checkpoint
        # keys onto this module's `model.` namespace.
        return WanVideoVAEStateDictConverter()
815
+
816
+
817
class WanVideoVAEStateDictConverter:
    """Remaps raw VAE checkpoint keys onto the `model.` submodule namespace."""

    def __init__(self):
        pass

    def from_civitai(self, state_dict):
        """Prefix every weight key with `model.`, unwrapping an optional
        `model_state` container first."""
        if 'model_state' in state_dict:
            state_dict = state_dict['model_state']
        return {'model.' + key: value for key, value in state_dict.items()}
diffsynth/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .wan_video_new_determine import WanVideoPipeline
diffsynth/pipelines/wan_video_new_determine.py ADDED
@@ -0,0 +1,1730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import time
4
+ import types
5
+ import warnings
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+ from einops import rearrange, reduce, repeat
12
+ # from modelscope import snapshot_download
13
+ from huggingface_hub import snapshot_download
14
+ from PIL import Image
15
+ from tqdm import tqdm
16
+ from typing_extensions import Literal
17
+
18
+ from ..models import ModelManager, load_state_dict
19
+ from ..models.wan_video_dit import RMSNorm, WanModel, sinusoidal_embedding_1d
20
+ from ..models.wan_video_image_encoder import WanImageEncoder
21
+ from ..models.wan_video_motion_controller import WanMotionControllerModel
22
+ # from ..model.
23
+ from ..models.wan_video_text_encoder import (T5LayerNorm, T5RelativeEmbedding,
24
+ WanTextEncoder)
25
+ from ..models.wan_video_vace import VaceWanModel
26
+ from ..models.wan_video_vae import (CausalConv3d, RMS_norm, Upsample,
27
+ WanVideoVAE)
28
+ from ..schedulers.flow_match import FlowMatchScheduler
29
+ # from ..prompters import WanPrompter
30
+ from ..vram_management import (AutoWrappedLinear, AutoWrappedModule,
31
+ WanAutoCastLayerNorm, enable_vram_management)
32
+
33
+
34
class BasePipeline(torch.nn.Module):
    """Shared pipeline scaffolding.

    Responsibilities: device/dtype bookkeeping for intermediate tensors,
    input shape validation, PIL/tensor conversion, VRAM-aware model
    on/offloading, and seeded noise generation.
    """

    def __init__(
        self,
        device="cuda",
        torch_dtype=torch.float16,
        height_division_factor=64,
        width_division_factor=64,
        time_division_factor=None,
        time_division_remainder=None,
    ):
        super().__init__()
        # The device and torch_dtype are used for the storage of
        # intermediate variables, not for the models themselves.
        self.device = device
        self.torch_dtype = torch_dtype
        # Divisibility constraints enforced by check_resize_height_width.
        self.height_division_factor = height_division_factor
        self.width_division_factor = width_division_factor
        self.time_division_factor = time_division_factor
        self.time_division_remainder = time_division_remainder
        self.vram_management_enabled = False

    def to(self, *args, **kwargs):
        """Mirror nn.Module.to while keeping self.device / self.torch_dtype
        (used for intermediate tensors) in sync with the move."""
        # NOTE(review): torch._C._nn._parse_to is a private API; it mirrors
        # what nn.Module.to accepts but may change between torch versions.
        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(
            *args, **kwargs
        )
        if device is not None:
            self.device = device
        if dtype is not None:
            self.torch_dtype = dtype
        super().to(*args, **kwargs)
        return self

    def check_resize_height_width(self, height, width, num_frames=None):
        """Validate (height, width, num_frames) against the pipeline's
        divisibility constraints and return them unchanged.

        Raises:
            AssertionError: on any violated constraint.
        """
        assert (
            height % self.height_division_factor == 0
        ), f"height {height} is not divisible by {self.height_division_factor}."

        assert (
            width % self.width_division_factor == 0
        ), f"width {width} is not divisible by {self.width_division_factor}."
        # NOTE(review): despite the num_frames=None default, this assert
        # requires num_frames (and time_division_factor) to be set — confirm
        # callers always pass it.  `(n + f) % f` is just `n % f`.
        assert (num_frames is not None) and (
            (num_frames + self.time_division_factor) % self.time_division_factor
            == self.time_division_remainder
        ), f"num_frames {num_frames} is not divisible by {self.time_division_factor} with remainder {self.time_division_remainder}."
        return height, width, num_frames

    def preprocess_image(
        self,
        image,
        torch_dtype=None,
        device=None,
        pattern="B C H W",
        min_value=-1,
        max_value=1,
    ):
        """Convert a PIL.Image or tensor to a normalized tensor.

        Tensor inputs are mapped to [min_value, max_value] WITHOUT a /255,
        so they are assumed to already be scaled to [0, 1] — TODO confirm
        against callers.  PIL inputs are divided by 255 before remapping
        and then reshaped to `pattern`.
        """
        if isinstance(image, torch.Tensor):
            # Accept C H W or B C H W with 3 channels.
            assert (len(image.shape) == 3 and image.shape[0] == 3) or (
                len(image.shape) == 4 and image.shape[1] == 3
            ), "Image tensor must be in 3 H W or B 3 H W format."
            image = image.to(
                dtype=torch_dtype or self.torch_dtype, device=device or self.device
            )
            image = image * ((max_value - min_value)) + min_value
            if len(image.shape) == 3:
                image = image.unsqueeze(0)  # Add batch dimension
        else:
            image = torch.Tensor(np.array(image, dtype=np.float32))
            image = image.to(
                dtype=torch_dtype or self.torch_dtype, device=device or self.device
            )
            image = image * ((max_value - min_value) / 255) + min_value
            # NOTE(review): the `pattern` reshape only happens on this PIL
            # branch; tensor inputs keep their own layout.
            image = repeat(
                image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {})
            )
        return image

    def preprocess_video(
        self,
        video,
        torch_dtype=None,
        device=None,
        pattern="B C T H W",
        min_value=-1,
        max_value=1,
    ):
        """Preprocess each frame with preprocess_image and stack them along
        the T axis of `pattern`."""
        video = [
            self.preprocess_image(
                image,
                torch_dtype=torch_dtype,
                device=device,
                min_value=min_value,
                max_value=max_value,
            )
            for image in video
        ]
        # pattern.index("T") // 2 maps the pattern position to a dim index
        # (each axis label occupies two characters: letter + space).
        video = torch.stack(video, dim=pattern.index("T") // 2)
        return video

    def vae_output_to_image(
        self, vae_output, pattern="B C H W", min_value=-1, max_value=1
    ):
        """Map VAE output from [min_value, max_value] to a float32 numpy
        array in [0, 255] with layout H W C (mean-reduced over other dims).

        NOTE(review): despite the name, this returns a numpy array, not a
        PIL.Image.
        """
        if pattern != "H W C":
            vae_output = reduce(
                vae_output, f"{pattern} -> H W C", reduction="mean")

        image = (vae_output - min_value) * (255.0 / (max_value - min_value))
        image = image.clamp(0.0, 255.0)

        image = image.to(device="cpu", dtype=torch.float32)
        image = image.numpy()
        return image

    def vae_output_to_video(
        self, vae_output, pattern="B C T H W", min_value=-1, max_value=1
    ):
        """Convert a (B, 3, T, H, W) VAE output to a float numpy video in
        [0, 1] with layout (B, T, H, W, C).

        NOTE(review): min_value/max_value are currently ignored — the range
        is hard-coded to [-1, 1] via (x + 1) / 2; confirm callers never
        pass a different range.
        """
        if vae_output.ndim == 5:  # B C T H W
            assert (
                vae_output.shape[1] == 3
            ), f"vae_output shape {vae_output.shape} is not valid. Expected 5D tensor with 3 channels on the second dimension."
            vae_output = vae_output.permute(0, 2, 3, 4, 1)
        video = vae_output.to(device="cpu", dtype=torch.float32).numpy()
        video = (video + 1.0) / 2.0
        video = video.clip(0.0, 1.0)
        return video

    def load_models_to_device(self, model_names=[]):
        """Move the named child models onto self.device and everything
        else off (to CPU, or via each module's offload/onload hooks).

        No-op unless VRAM management is enabled.
        NOTE(review): the mutable default `[]` is safe only because it is
        never mutated here.
        """
        if self.vram_management_enabled:
            # Offload models not needed for the next stage.
            for name, model in self.named_children():
                if name not in model_names:
                    if (
                        hasattr(model, "vram_management_enabled")
                        and model.vram_management_enabled
                    ):
                        for module in model.modules():
                            if hasattr(module, "offload"):
                                module.offload()
                    else:
                        model.cpu()
            torch.cuda.empty_cache()
            # Onload the requested models.
            for name, model in self.named_children():
                if name in model_names:
                    if (
                        hasattr(model, "vram_management_enabled")
                        and model.vram_management_enabled
                    ):
                        for module in model.modules():
                            if hasattr(module, "onload"):
                                module.onload()
                    else:
                        model.to(self.device)

    def generate_noise(
        self,
        shape,
        seed=None,
        rand_device="cpu",
        rand_torch_dtype=torch.float32,
        device=None,
        torch_dtype=None,
    ):
        """Draw seeded Gaussian noise of `shape` on `rand_device` (so the
        draw is reproducible regardless of target device), then cast/move
        it to the requested device/dtype."""
        generator = (
            None if seed is None else torch.Generator(
                rand_device).manual_seed(seed)
        )
        # TODO multi-res noise
        noise = torch.randn(
            shape, generator=generator, device=rand_device, dtype=rand_torch_dtype
        )
        noise = noise.to(
            dtype=torch_dtype or self.torch_dtype, device=device or self.device
        )
        return noise

    def enable_cpu_offload(self):
        """Deprecated alias: just sets the VRAM-management flag."""
        warnings.warn(
            "`enable_cpu_offload` will be deprecated. Please use `enable_vram_management`."
        )
        self.vram_management_enabled = True

    def get_vram(self):
        # Total memory of self.device in GiB; mem_get_info() returns
        # (free, total) and index [1] is the total.
        return torch.cuda.mem_get_info(self.device)[1] / (1024**3)

    def freeze_except(self, model_names):
        """Put the named children in train mode with grads enabled and
        freeze (eval + no grads) every other child, logging both."""
        for name, model in self.named_children():
            if name in model_names:
                print(f"Unfreezing model {name}.")
                print(
                    f"Model parameters: {sum(p.numel() for p in model.parameters())}")
                model.train()
                model.requires_grad_(True)
            else:
                print(f"Freezing model {name}.")
                print(
                    f"Model parameters: {sum(p.numel() for p in model.parameters())}")
                model.eval()
                model.requires_grad_(False)
272
+
273
+
274
@dataclass
class ModelConfig:
    """Location of one model artifact: either a resolved local `path`, or
    a remote `model_id` + `origin_file_pattern` fetched on demand via
    huggingface_hub into `local_model_path`.

    offload_device / offload_dtype optionally override where and how the
    model weights are stored after loading.
    """

    path: Union[str, list[str]] = None
    model_id: str = None
    origin_file_pattern: Union[str, list[str]] = None
    # NOTE(review): historic name — downloading now goes through
    # huggingface_hub, not ModelScope.
    download_resource: str = "ModelScope"
    offload_device: Optional[Union[str, torch.device]] = None
    offload_dtype: Optional[torch.dtype] = None

    def download_if_necessary(
        self, local_model_path="./models", skip_download=False, use_usp=False
    ):
        """Populate self.path, downloading the file(s) first if needed.

        With use_usp=True only rank 0 downloads; the other ranks skip the
        download and wait at a barrier until rank 0 finishes.
        """
        if self.path is None:
            # Check model_id and origin_file_pattern
            if self.model_id is None:
                raise ValueError(
                    f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`."""
                )

            # Skip the download on every rank except 0
            if use_usp:
                import torch.distributed as dist

                skip_download = dist.get_rank() != 0

            # Decide whether the origin pattern denotes a folder
            if self.origin_file_pattern is None or self.origin_file_pattern == "":
                self.origin_file_pattern = ""
                allow_file_pattern = None
                is_folder = True
            elif isinstance(
                self.origin_file_pattern, str
            ) and self.origin_file_pattern.endswith("/"):
                allow_file_pattern = self.origin_file_pattern + "*"
                is_folder = True
            else:
                allow_file_pattern = self.origin_file_pattern
                is_folder = False

            # Download
            if not skip_download:
                # Files already on disk are excluded so they are not re-fetched.
                downloaded_files = glob.glob(
                    self.origin_file_pattern,
                    root_dir=os.path.join(local_model_path, self.model_id),
                )
                snapshot_download(
                    self.model_id,
                    repo_type="model",  # change to "dataset" for dataset repos
                    local_dir=os.path.join(local_model_path, self.model_id),
                    allow_patterns=allow_file_pattern,
                    ignore_patterns=downloaded_files,  # note: patterns, not paths
                    local_files_only=False,
                    # NOTE(review): resume_download is deprecated in recent
                    # huggingface_hub versions (resuming is the default).
                    resume_download=True,

                )

            # Let rank 1, 2, ... wait for rank 0
            if use_usp:
                import torch.distributed as dist

                dist.barrier(device_ids=[dist.get_rank()])

            # Resolve the downloaded path(s)
            if is_folder:
                self.path = os.path.join(
                    local_model_path, self.model_id, self.origin_file_pattern
                )
            else:
                self.path = glob.glob(
                    os.path.join(
                        local_model_path, self.model_id, self.origin_file_pattern
                    )
                )
            # Collapse a single-file glob result to a plain string.
            if isinstance(self.path, list) and len(self.path) == 1:
                self.path = self.path[0]
358
+
359
+
360
class WanVideoPipeline(BasePipeline):
    """Wan2.1 video pipeline specialized for deterministic (single-step)
    prediction; latent geometry follows the Wan VAE (spatial /16 checks,
    temporal factor 4 with remainder 1)."""

    def __init__(self, device="cuda", torch_dtype=torch.bfloat16, tokenizer_path=None):
        # NOTE(review): tokenizer_path is currently unused (the prompter
        # is disabled); kept for interface compatibility.
        super().__init__(
            device=device,
            torch_dtype=torch_dtype,
            height_division_factor=16,
            width_division_factor=16,
            time_division_factor=4,
            time_division_remainder=1,
        )
        # Rectified-flow scheduler; shift=5 matches the Wan defaults used
        # elsewhere in this file.
        self.scheduler = FlowMatchScheduler(
            shift=5, sigma_min=0.0, extra_one_step=True)
        # Sub-models are populated later by from_pretrained().
        self.text_encoder: WanTextEncoder = None
        self.image_encoder: WanImageEncoder = None
        self.dit: WanModel = None
        self.vae: WanVideoVAE = None
        self.motion_controller: WanMotionControllerModel = None
        self.vace: VaceWanModel = None
        # Models that stay on-device during the denoising loop.
        self.in_iteration_models = ("dit", "motion_controller", "vace")
        self.unit_runner = PipelineUnitRunner()

        # Preprocessing units executed before denoising: shape checking,
        # input-video latent embedding, prompt/image embedding, and
        # unified-sequence-parallel setup.
        self.units = [
            WanVideoUnit_ShapeChecker(),  # check if the shape is ok
            WanVideoUnit_InputVideoEmbedder(),
            WanVideoUnit_PromptEmbedder(),
            WanVideoUnit_ImageEmbedder(),
            WanVideoUnit_UnifiedSequenceParallel(),
        ]

        self.model_fn = model_fn_wan_video
400
+
401
    def training_predict(self, **inputs):
        """Run one deterministic training step.

        Always uses the FIRST scheduler timestep (index 0), so the model is
        trained as a single-step predictor.  The RGB latents are fed to the
        DiT directly as `latents` (no noise is added here), and the
        flow-matching target is built from the depth and RGB latents —
        presumably an interpolation defined by the scheduler; confirm in
        FlowMatchScheduler.training_target.

        Returns:
            dict with `rgb_gt`, `depth_gt` (training target), `pred`
            (model output) and the per-timestep loss `weight`.
        """
        timestep_id = torch.tensor([0])
        timestep = self.scheduler.timesteps[timestep_id].to(
            dtype=self.torch_dtype, device=self.device
        )
        # Condition the model on the RGB latents directly.
        inputs["latents"] = inputs['rgb_latents']
        training_target = self.scheduler.training_target(
            inputs["depth_latents"], inputs["rgb_latents"], timestep
        )
        noise_pred = self.model_fn(**inputs, timestep=timestep)

        return {
            'rgb_gt': inputs['rgb_latents'],
            "depth_gt": training_target,
            "pred": noise_pred,
            "weight": self.scheduler.training_weight(timestep),
        }
420
+
421
+ def enable_vram_management(
422
+ self, num_persistent_param_in_dit=None, vram_limit=None, vram_buffer=0.5
423
+ ):
424
+ self.vram_management_enabled = True
425
+ if num_persistent_param_in_dit is not None:
426
+ vram_limit = None
427
+ else:
428
+ if vram_limit is None:
429
+ vram_limit = self.get_vram()
430
+ vram_limit = vram_limit - vram_buffer
431
+ if self.text_encoder is not None:
432
+ dtype = next(iter(self.text_encoder.parameters())).dtype
433
+ enable_vram_management(
434
+ self.text_encoder,
435
+ module_map={
436
+ torch.nn.Linear: AutoWrappedLinear,
437
+ torch.nn.Embedding: AutoWrappedModule,
438
+ T5RelativeEmbedding: AutoWrappedModule,
439
+ T5LayerNorm: AutoWrappedModule,
440
+ },
441
+ module_config=dict(
442
+ offload_dtype=dtype,
443
+ offload_device="cpu",
444
+ onload_dtype=dtype,
445
+ onload_device="cpu",
446
+ computation_dtype=self.torch_dtype,
447
+ computation_device=self.device,
448
+ ),
449
+ vram_limit=vram_limit,
450
+ )
451
+ if self.dit is not None:
452
+ dtype = next(iter(self.dit.parameters())).dtype
453
+ device = "cpu" if vram_limit is not None else self.device
454
+ enable_vram_management(
455
+ self.dit,
456
+ module_map={
457
+ torch.nn.Linear: AutoWrappedLinear,
458
+ torch.nn.Conv3d: AutoWrappedModule,
459
+ torch.nn.LayerNorm: WanAutoCastLayerNorm,
460
+ RMSNorm: AutoWrappedModule,
461
+ torch.nn.Conv2d: AutoWrappedModule,
462
+ },
463
+ module_config=dict(
464
+ offload_dtype=dtype,
465
+ offload_device="cpu",
466
+ onload_dtype=dtype,
467
+ onload_device=device,
468
+ computation_dtype=self.torch_dtype,
469
+ computation_device=self.device,
470
+ ),
471
+ max_num_param=num_persistent_param_in_dit,
472
+ overflow_module_config=dict(
473
+ offload_dtype=dtype,
474
+ offload_device="cpu",
475
+ onload_dtype=dtype,
476
+ onload_device="cpu",
477
+ computation_dtype=self.torch_dtype,
478
+ computation_device=self.device,
479
+ ),
480
+ vram_limit=vram_limit,
481
+ )
482
+ if self.vae is not None:
483
+ dtype = next(iter(self.vae.parameters())).dtype
484
+ enable_vram_management(
485
+ self.vae,
486
+ module_map={
487
+ torch.nn.Linear: AutoWrappedLinear,
488
+ torch.nn.Conv2d: AutoWrappedModule,
489
+ RMS_norm: AutoWrappedModule,
490
+ CausalConv3d: AutoWrappedModule,
491
+ Upsample: AutoWrappedModule,
492
+ torch.nn.SiLU: AutoWrappedModule,
493
+ torch.nn.Dropout: AutoWrappedModule,
494
+ },
495
+ module_config=dict(
496
+ offload_dtype=dtype,
497
+ offload_device="cpu",
498
+ onload_dtype=dtype,
499
+ onload_device=self.device,
500
+ computation_dtype=self.torch_dtype,
501
+ computation_device=self.device,
502
+ ),
503
+ )
504
+ if self.image_encoder is not None:
505
+ dtype = next(iter(self.image_encoder.parameters())).dtype
506
+ enable_vram_management(
507
+ self.image_encoder,
508
+ module_map={
509
+ torch.nn.Linear: AutoWrappedLinear,
510
+ torch.nn.Conv2d: AutoWrappedModule,
511
+ torch.nn.LayerNorm: AutoWrappedModule,
512
+ },
513
+ module_config=dict(
514
+ offload_dtype=dtype,
515
+ offload_device="cpu",
516
+ onload_dtype=dtype,
517
+ onload_device="cpu",
518
+ computation_dtype=dtype,
519
+ computation_device=self.device,
520
+ ),
521
+ )
522
+ if self.motion_controller is not None:
523
+ dtype = next(iter(self.motion_controller.parameters())).dtype
524
+ enable_vram_management(
525
+ self.motion_controller,
526
+ module_map={
527
+ torch.nn.Linear: AutoWrappedLinear,
528
+ },
529
+ module_config=dict(
530
+ offload_dtype=dtype,
531
+ offload_device="cpu",
532
+ onload_dtype=dtype,
533
+ onload_device="cpu",
534
+ computation_dtype=dtype,
535
+ computation_device=self.device,
536
+ ),
537
+ )
538
+ if self.vace is not None:
539
+ device = "cpu" if vram_limit is not None else self.device
540
+ enable_vram_management(
541
+ self.vace,
542
+ module_map={
543
+ torch.nn.Linear: AutoWrappedLinear,
544
+ torch.nn.Conv3d: AutoWrappedModule,
545
+ torch.nn.LayerNorm: AutoWrappedModule,
546
+ RMSNorm: AutoWrappedModule,
547
+ },
548
+ module_config=dict(
549
+ offload_dtype=dtype,
550
+ offload_device="cpu",
551
+ onload_dtype=dtype,
552
+ onload_device=device,
553
+ computation_dtype=self.torch_dtype,
554
+ computation_device=self.device,
555
+ ),
556
+ vram_limit=vram_limit,
557
+ )
558
+
559
    def initialize_usp(self):
        """Initialize torch.distributed and xfuser model parallelism for
        Unified Sequence Parallel: a single ulysses group spanning all
        ranks (ring degree 1), with each rank pinned to its CUDA device."""
        import torch.distributed as dist
        from xfuser.core.distributed import (init_distributed_environment,
                                             initialize_model_parallel)

        dist.init_process_group(backend="nccl", init_method="env://")
        init_distributed_environment(
            rank=dist.get_rank(), world_size=dist.get_world_size()
        )
        initialize_model_parallel(
            sequence_parallel_degree=dist.get_world_size(),
            ring_degree=1,
            ulysses_degree=dist.get_world_size(),
        )
        # NOTE(review): assumes one process per local GPU (rank == device
        # index) — confirm for multi-node launches.
        torch.cuda.set_device(dist.get_rank())
574
+
575
    def enable_usp(self):
        """Patch the DiT for Unified Sequence Parallel execution and record
        the sequence-parallel world size."""
        from xfuser.core.distributed import get_sequence_parallel_world_size

        from ..distributed.xdit_context_parallel import (usp_attn_forward,
                                                         usp_dit_forward)

        # Rebind every self-attention forward and the top-level DiT forward
        # to their USP-aware variants, as bound instance methods.
        for block in self.dit.blocks:
            block.self_attn.forward = types.MethodType(
                usp_attn_forward, block.self_attn
            )
        self.dit.forward = types.MethodType(usp_dit_forward, self.dit)
        self.sp_size = get_sequence_parallel_world_size()
        self.use_unified_sequence_parallel = True
588
+
589
    @staticmethod
    def from_pretrained(
        torch_dtype: torch.dtype = torch.bfloat16,
        device: Union[str, torch.device] = "cuda",
        model_configs: list[ModelConfig] = [],
        tokenizer_config: ModelConfig = ModelConfig(
            model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="google/*"
        ),
        local_model_path: str = "./models",
        skip_download: bool = False,
        redirect_common_files: bool = True,
        use_usp=False,
    ):
        """Build a WanVideoPipeline: download/load the configured model
        files, attach the fetched sub-models, and optionally enable
        Unified Sequence Parallel.

        redirect_common_files maps files shared between Wan repos (VAE,
        T5 encoder, CLIP) to one canonical repo so they are only
        downloaded once.
        """
        # Redirect model paths of shared files to a canonical repo.
        if redirect_common_files:
            redirect_dict = {
                "models_t5_umt5-xxl-enc-bf16.pth": "Wan-AI/Wan2.1-T2V-1.3B",
                "Wan2.1_VAE.pth": "Wan-AI/Wan2.1-T2V-1.3B",
                "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth": "Wan-AI/Wan2.1-I2V-14B-480P",
            }
            for model_config in model_configs:
                if (
                    model_config.origin_file_pattern is None
                    or model_config.model_id is None
                ):
                    continue
                if (
                    model_config.origin_file_pattern in redirect_dict
                    and model_config.model_id
                    != redirect_dict[model_config.origin_file_pattern]
                ):
                    print(
                        f"To avoid repeatedly downloading model files, ({model_config.model_id}, {model_config.origin_file_pattern}) is redirected to ({redirect_dict[model_config.origin_file_pattern]}, {model_config.origin_file_pattern}). You can use `redirect_common_files=False` to disable file redirection."
                    )
                    model_config.model_id = redirect_dict[
                        model_config.origin_file_pattern
                    ]

        # Initialize pipeline (USP setup must precede model loading so
        # rank-0-only downloads can synchronize).
        pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
        if use_usp:
            pipe.initialize_usp()

        # Download (if necessary) and load every configured model.
        model_manager = ModelManager()
        for model_config in model_configs:
            model_config.download_if_necessary(
                local_model_path, skip_download=skip_download, use_usp=use_usp
            )
            model_manager.load_model(
                model_config.path,
                device=model_config.offload_device or device,
                torch_dtype=model_config.offload_dtype or torch_dtype,
            )

        # Attach whichever sub-models were present in the checkpoints
        # (fetch_model presumably returns None for absent ones — confirm
        # in ModelManager).  The text encoder/prompter path is disabled.
        pipe.dit = model_manager.fetch_model("wan_video_dit")

        pipe.vae = model_manager.fetch_model("wan_video_vae")
        pipe.image_encoder = model_manager.fetch_model(
            "wan_video_image_encoder")
        pipe.motion_controller = model_manager.fetch_model(
            "wan_video_motion_controller"
        )
        pipe.vace = model_manager.fetch_model("wan_video_vace")

        # Fetch the tokenizer files (even though the prompter is disabled,
        # this keeps tokenizer_config.path resolved for callers).
        tokenizer_config.download_if_necessary(
            local_model_path, skip_download=skip_download
        )

        # Unified Sequence Parallel
        if use_usp:
            pipe.enable_usp()
        return pipe
667
+
668
+ # @torch.no_grad()
669
+ @torch.inference_mode()
670
+ def __call__(
671
+ self,
672
+ # Prompt
673
+ prompt: str,
674
+ negative_prompt: Optional[str] = "",
675
+ # Image-to-video
676
+ input_image: Optional[Image.Image] = None,
677
+ # First-last-frame-to-video
678
+ end_image: Optional[Image.Image] = None,
679
+ # Video-to-video
680
+ input_video: Optional[list[Image.Image]] = None,
681
+ denoising_strength: Optional[float] = 1.0,
682
+ # ControlNet
683
+ reference_image: Optional[Image.Image] = None,
684
+ extra_images: Optional[List[Image.Image]] = None,
685
+ extra_image_frame_index: Optional[List[int]] = None,
686
+ # VACE
687
+ vace_video: Optional[list[Image.Image]] = None,
688
+ vace_video_mask: Optional[Image.Image] = None,
689
+ vace_reference_image: Optional[Image.Image] = None,
690
+ vace_scale: Optional[float] = 1.0,
691
+ # Randomness
692
+ seed: Optional[int] = None,
693
+ rand_device: Optional[str] = "cpu",
694
+ # Shape
695
+ mode: Optional[str] = "regression",
696
+ batch_size: Optional[int] = 1,
697
+ height: Optional[int] = 480,
698
+ width: Optional[int] = 720,
699
+ frame_mask: Optional[torch.Tensor] = None,
700
+ num_frames=41,
701
+ # Classifier-free guidance
702
+ cfg_scale: Optional[float] = 1,
703
+ cfg_merge: Optional[bool] = False,
704
+ # Scheduler
705
+ num_inference_steps: Optional[int] = 1,
706
+ sigma_shift: Optional[float] = 5.0,
707
+ denoise_step=1,
708
+ # Speed control
709
+ motion_bucket_id: Optional[int] = None,
710
+ # VAE tiling
711
+ tiled: Optional[bool] = False,
712
+ tile_size: Optional[tuple[int, int]] = (30, 52),
713
+ tile_stride: Optional[tuple[int, int]] = (15, 26),
714
+ # Sliding window
715
+ sliding_window_size: Optional[int] = None,
716
+ sliding_window_stride: Optional[int] = None,
717
+ # Teacache
718
+ tea_cache_l1_thresh: Optional[float] = None,
719
+ tea_cache_model_id: Optional[str] = "",
720
+ # progress_bar
721
+ progress_bar_cmd=tqdm,
722
+ ):
723
+ self.scheduler.set_timesteps(
724
+ num_inference_steps=num_inference_steps,
725
+ denoising_strength=denoising_strength,
726
+ shift=sigma_shift,
727
+ denoise_step=denoise_step,
728
+ )
729
+
730
+ # Inputs
731
+ inputs_posi = {
732
+ "prompt": prompt,
733
+ "prompt_num": batch_size,
734
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
735
+ "tea_cache_model_id": tea_cache_model_id,
736
+ "num_inference_steps": num_inference_steps,
737
+ }
738
+ inputs_nega = {
739
+ "negative_prompt": negative_prompt,
740
+ "prompt_num": batch_size,
741
+ "tea_cache_l1_thresh": tea_cache_l1_thresh,
742
+ "tea_cache_model_id": tea_cache_model_id,
743
+ "num_inference_steps": num_inference_steps,
744
+ }
745
+
746
+ inputs_shared = {
747
+ "batch_size": batch_size,
748
+ "input_image": input_image,
749
+ "end_image": end_image,
750
+ "input_video": input_video,
751
+ "denoising_strength": denoising_strength,
752
+ "reference_image": reference_image,
753
+ "vace_video": vace_video,
754
+ "vace_video_mask": vace_video_mask,
755
+ "vace_reference_image": vace_reference_image,
756
+ "vace_scale": vace_scale,
757
+ "seed": seed,
758
+ "rand_device": rand_device,
759
+ 'mode': mode,
760
+ "height": height,
761
+ "width": width,
762
+ "frame_mask": frame_mask,
763
+ "num_frames": num_frames,
764
+ "cfg_scale": cfg_scale,
765
+ "cfg_merge": cfg_merge,
766
+ "sigma_shift": sigma_shift,
767
+ "motion_bucket_id": motion_bucket_id,
768
+ "tiled": tiled,
769
+ "tile_size": tile_size,
770
+ "tile_stride": tile_stride,
771
+ "sliding_window_size": sliding_window_size,
772
+ "sliding_window_stride": sliding_window_stride,
773
+ "extra_images": extra_images,
774
+ "extra_image_frame_index": extra_image_frame_index,
775
+ }
776
+ for unit in self.units:
777
+ inputs_shared, inputs_posi, inputs_nega = self.unit_runner(
778
+ unit, self, inputs_shared, inputs_posi, inputs_nega
779
+ )
780
+
781
+ models = {name: getattr(self, name)
782
+ for name in self.in_iteration_models}
783
+
784
+ for timestep in self.scheduler.timesteps:
785
+ timestep = timestep.unsqueeze(0).to(
786
+ dtype=self.torch_dtype, device=self.device
787
+ )
788
+ # torch.cuda.synchronize()
789
+ # start_time = time.time()
790
+ noise_pred_posi = self.model_fn(
791
+ **models, **inputs_shared, **inputs_posi, timestep=timestep
792
+ )
793
+ # torch.cuda.synchronize()
794
+ # end_time = time.time()
795
+ # print(f"Model forward time: {end_time - start_time}")
796
+ noise_pred = noise_pred_posi
797
+
798
+ inputs_shared["latents"] = self.scheduler.step(
799
+ model_output=noise_pred,
800
+ sample=inputs_shared["latents"],
801
+ )
802
+
803
+ rgb, depth = None, None
804
+ if isinstance(inputs_shared['latents'], tuple):
805
+ rgb, depth = inputs_shared['latents']
806
+ else:
807
+ depth = inputs_shared['latents']
808
+
809
+ # VACE (TODO: remove it)
810
+ if vace_reference_image is not None:
811
+ inputs_shared["latents"] = inputs_shared["latents"][:, :, 1:]
812
+
813
+ # torch.cuda.synchronize()
814
+ # start_time = time.time()
815
+ depth_video = self.vae.decode(
816
+ depth,
817
+ device=self.device,
818
+ tiled=tiled,
819
+ tile_size=tile_size,
820
+ tile_stride=tile_stride,
821
+ )
822
+ # torch.cuda.synchronize()
823
+ # end_time = time.time()
824
+ # print(f"VAE decoding time: {end_time - start_time}")
825
+ depth_video = self.vae_output_to_video(depth_video)
826
+ rgb_video = None
827
+ if rgb is not None:
828
+ rgb_video = self.vae.decode(
829
+ depth,
830
+ device=self.device,
831
+ tiled=tiled,
832
+ tile_size=tile_size,
833
+ tile_stride=tile_stride,
834
+ )
835
+ rgb_video = self.vae_output_to_video(rgb_video)
836
+
837
+ return {
838
+ 'depth': depth_video,
839
+ 'rgb': rgb_video
840
+ }
841
+
842
+
843
class PipelineUnit:
    """One stage of the pipeline.

    A unit declares which entries of the input dictionaries it consumes
    and which models it needs on-device, and returns a dict of new or
    updated entries.
    """

    def __init__(
        self,
        seperate_cfg: bool = False,
        take_over: bool = False,
        input_params: tuple[str] = None,
        input_params_posi: dict[str, str] = None,
        input_params_nega: dict[str, str] = None,
        onload_model_names: tuple[str] = None,
    ):
        # seperate_cfg: run `process` once per conditioning branch
        # (positive / negative) instead of once on the shared inputs.
        self.seperate_cfg = seperate_cfg
        # take_over: the unit rewrites all three input dicts itself.
        self.take_over = take_over
        # Names read from the shared inputs.
        self.input_params = input_params
        # Per-branch {process-kwarg: branch-dict-key} mappings.
        self.input_params_posi = input_params_posi
        self.input_params_nega = input_params_nega
        # Models that must be resident before `process` runs.
        self.onload_model_names = onload_model_names

    def process(
        self, pipe: WanVideoPipeline, inputs: dict, positive=True, **kwargs
    ) -> dict:
        """Transform the inputs; implemented by each concrete unit."""
        raise NotImplementedError("`process` is not implemented.")
864
+
865
+
866
class PipelineUnitRunner:
    """Dispatches a PipelineUnit over the shared/positive/negative inputs."""

    def __init__(self):
        pass

    def __call__(
        self,
        unit: PipelineUnit,
        pipe: WanVideoPipeline,
        inputs_shared: dict,
        inputs_posi: dict,
        inputs_nega: dict,
    ) -> tuple[dict, dict]:
        if unit.take_over:
            # The unit manipulates all three dictionaries itself.
            return unit.process(
                pipe,
                inputs_shared=inputs_shared,
                inputs_posi=inputs_posi,
                inputs_nega=inputs_nega,
            )
        if unit.seperate_cfg:
            # Positive branch.
            kwargs = {
                dst: inputs_posi.get(src)
                for dst, src in unit.input_params_posi.items()
            }
            if unit.input_params is not None:
                for name in unit.input_params:
                    kwargs[name] = inputs_shared.get(name)
            outputs = unit.process(pipe, **kwargs)
            inputs_posi.update(outputs)
            # Negative branch: recompute only when CFG is active; otherwise
            # reuse the positive branch's outputs verbatim.
            if inputs_shared["cfg_scale"] != 1:
                kwargs = {
                    dst: inputs_nega.get(src)
                    for dst, src in unit.input_params_nega.items()
                }
                if unit.input_params is not None:
                    for name in unit.input_params:
                        kwargs[name] = inputs_shared.get(name)
                inputs_nega.update(unit.process(pipe, **kwargs))
            else:
                inputs_nega.update(outputs)
        else:
            # Plain unit: feed it shared inputs, merge its outputs back.
            kwargs = {name: inputs_shared.get(name) for name in unit.input_params}
            inputs_shared.update(unit.process(pipe, **kwargs))
        return inputs_shared, inputs_posi, inputs_nega
917
+
918
+
919
class WanVideoUnit_ShapeChecker(PipelineUnit):
    """Snaps (height, width, num_frames) to sizes the model supports."""

    def __init__(self):
        super().__init__(input_params=("height", "width", "num_frames"))

    def process(self, pipe: WanVideoPipeline, height, width, num_frames):
        # The pipeline owns the actual constraint logic.
        checked = pipe.check_resize_height_width(height, width, num_frames)
        height, width, num_frames = checked
        return {"height": height, "width": width, "num_frames": num_frames}
934
+
935
+
936
class WanVideoUnit_NoiseInitializer(PipelineUnit):
    """Draws the initial Gaussian latent noise.

    The latent grid is 8x smaller spatially; every 4 video frames pack into
    one latent frame, plus one extra slot when a VACE reference image is
    prepended.
    """

    def __init__(self):
        super().__init__(
            input_params=(
                "batch_size",
                "height",
                "width",
                "num_frames",
                "seed",
                "rand_device",
                "vace_reference_image",
            )
        )

    def process(
        self,
        pipe: WanVideoPipeline,
        batch_size,
        height,
        width,
        num_frames,
        seed,
        rand_device,
        vace_reference_image,
    ):
        # 4x temporal compression in latent space.
        latent_frames = (num_frames - 1) // 4 + 1
        if vace_reference_image is not None:
            latent_frames += 1
        shape = (batch_size, 16, latent_frames, height // 8, width // 8)
        noise = pipe.generate_noise(shape, seed=seed, rand_device=rand_device)
        # The noise doubles as the initial latent state.
        return {"noise": noise, "latents": noise}
974
+
975
+
976
class WanVideoUnit_InputVideoEmbedder(PipelineUnit):  # For training only
    """Initializes the working latents from noise and/or VAE-encoded videos.

    Behavior depends on `mode` and on `pipe.scheduler.training`:
      - inference + 'generation': latents = pure noise.
      - inference + 'regression': latents = VAE-encoded input RGB video.
      - training + 'generation': rgb_latents = noise,
        depth_latents = encoded disparity video.
      - training + 'regression': rgb_latents = encoded RGB video,
        depth_latents = encoded disparity video.
    """

    def __init__(self):
        super().__init__(
            input_params=(
                'mode',
                'seed',
                'rand_device',
                "batch_size",
                "height",
                "width",
                "num_frames",
                "input_video",
                "input_disp",
                "noise",
                "tiled",
                "tile_size",
                "tile_stride",
                "vace_reference_image",
            ),
            onload_model_names=("vae",),
        )

    def process(
        self,
        pipe,
        mode,
        seed,
        rand_device,
        batch_size,
        height,
        width,
        num_frames,
        input_video,
        input_disp,
        noise,
        tiled,
        tile_size,
        tile_stride,
        vace_reference_image,
    ):
        assert mode in ['generation',
                        'regression'], f"mode {mode} is not supported"
        # 4x temporal compression in latent space.
        length = (num_frames - 1) // 4 + 1
        # Inference path: no disparity ground truth available.
        if not pipe.scheduler.training:
            if mode == 'generation':
                # Start from pure noise.
                noise = pipe.generate_noise(
                    (batch_size, 16, length, height // 8, width // 8),
                    seed=seed,
                    rand_device=rand_device,
                )
                return {'latents': noise}
            else:
                # Regression: encode each input video and batch the latents.
                video_list = []
                for _input_video in input_video:
                    _preprocessed_video = pipe.preprocess_video(_input_video)
                    video_list.append(_preprocessed_video)
                videos_tensor = torch.cat(video_list, dim=0)
                input_rgb_latents = pipe.vae.encode(
                    videos_tensor,
                    device=pipe.device,
                    tiled=tiled,
                    tile_size=tile_size,
                    tile_stride=tile_stride,
                ).to(dtype=pipe.torch_dtype, device=pipe.device)
                return {"latents": input_rgb_latents}

        # Training path: always encode the disparity (depth) ground truth.
        # NOTE(review): `input_disp` is not among the __call__ inputs in this
        # file — presumably supplied by the training loop; verify upstream.
        disp_list = []
        for _input_disp in input_disp:
            _preprocessed_disp = pipe.preprocess_video(_input_disp)
            disp_list.append(_preprocessed_disp)
        disp_tensor = torch.cat(disp_list, dim=0)
        input_disp_latents = pipe.vae.encode(
            disp_tensor,
            device=pipe.device,
            tiled=tiled,
            tile_size=tile_size,
            tile_stride=tile_stride,
        ).to(dtype=pipe.torch_dtype, device=pipe.device)

        # Training
        if mode == 'generation':
            # Noise for RGB; encoded ground truth for depth.
            noise = pipe.generate_noise(
                (batch_size, 16, length, height // 8, width // 8),
                seed=seed,
                rand_device=rand_device,
            )
            return {'rgb_latents': noise, 'depth_latents': input_disp_latents}
        else:
            # Encoded RGB plus encoded depth.
            video_list = []
            for _input_video in input_video:
                _preprocessed_video = pipe.preprocess_video(_input_video)
                video_list.append(_preprocessed_video)
            videos_tensor = torch.cat(video_list, dim=0)
            input_rgb_latents = pipe.vae.encode(
                videos_tensor,
                device=pipe.device,
                tiled=tiled,
                tile_size=tile_size,
                tile_stride=tile_stride,
            ).to(dtype=pipe.torch_dtype, device=pipe.device)
            return {
                "rgb_latents": input_rgb_latents,
                "depth_latents": input_disp_latents,
            }
1087
+
1088
+
1089
class WanVideoUnit_PromptEmbedder(PipelineUnit):
    """Supplies the text context for the DiT.

    The text encoder is disabled in this pipeline, so instead of encoding
    the prompt this unit returns an all-zero embedding with the layout the
    DiT expects: (prompt_num, 512, 4096).
    """

    def __init__(self):
        super().__init__(
            seperate_cfg=True,
            input_params_posi={
                "prompt": "prompt",
                "positive": "positive",
                "prompt_num": "prompt_num",
            },
            input_params_nega={
                "prompt": "negative_prompt",
                "positive": "positive",
                "prompt_num": "prompt_num",
            },
            onload_model_names=("text_encoder",),
        )

    def process(self, pipe: WanVideoPipeline, prompt, positive, prompt_num) -> dict:
        # Text conditioning is bypassed: `prompt` and `positive` are ignored
        # and a zero tensor stands in for the real embedding.
        context = torch.zeros([prompt_num, 512, 4096])
        context = context.to(dtype=pipe.torch_dtype, device=pipe.device)
        return {"context": context}
1127
+
1128
+
1129
class WanVideoUnit_ImageEmbedder(PipelineUnit):
    """Builds the image-conditioning inputs for I2V-style DiT models.

    Produces:
      - "clip_feature": CLIP embedding of the conditioning image;
      - "y": VAE latent of a video whose known frames hold real images and
        whose unknown frames are zeros, concatenated with a 4-channel
        known-frame mask at latent resolution.
    """

    def __init__(self):
        super().__init__(
            input_params=(
                "input_image",
                "end_image",
                "num_frames",
                "height",
                "width",
                "tiled",
                "tile_size",
                "tile_stride",
                "extra_images",
                "extra_image_frame_index",
            ),
            onload_model_names=("image_encoder", "vae"),
        )

    def process(
        self,
        pipe: WanVideoPipeline,
        input_image,
        end_image,
        num_frames,
        height,
        width,
        tiled,
        tile_size,
        tile_stride,
        extra_images,
        extra_image_frame_index,
    ):
        # NOTE(review): `end_image` is accepted but never used in this body.
        # Only DiT variants with image input need this conditioning.
        if not pipe.dit.has_image_input:
            return {}
        if input_image is None:
            return {}
        image = pipe.preprocess_image(input_image).to(pipe.device)  # B C H W
        batch_size = image.shape[0]
        # CLIP embedding of the conditioning frame(s).
        clip_context = pipe.image_encoder.encode_image([image])
        # Known-frame mask at latent resolution (1 = frame is known).
        msk = torch.ones(
            batch_size, num_frames, height // 8, width // 8, device=pipe.device
        )

        # First frame = conditioning image; the remaining frames start as zeros.
        vae_input = torch.concat(
            [
                image.unsqueeze(2),  # B C 1 H W
                torch.zeros(batch_size, 3, num_frames - 1, height, width).to(
                    image.device
                ),
            ],
            dim=2,
        )  # B C F H W

        vae_input = vae_input.permute(0, 2, 1, 3, 4).contiguous()  # B F C H W

        if (
            extra_images is not None
            and extra_image_frame_index is not None
        ):
            # Splice additional known frames into the conditioning video...
            for _videoid, _video in enumerate(extra_images):
                # _video F C H W
                for idx, image in enumerate(_video):
                    if idx == 0:
                        # Frame 0 already holds input_image.
                        continue
                    image = pipe.preprocess_image(
                        image).to(pipe.device)  # 1 C H W
                    vae_input[_videoid, idx] = image.squeeze(0)

            # ...and restrict the known-frame mask to those frame indices.
            mask = extra_image_frame_index[:, :, None, None].to(
                pipe.device)  # B F 1 1
            mask = mask.expand(
                batch_size, mask.shape[1], height // 8, width // 8
            )  # B F H W

            msk = msk * mask
        else:
            # Only the first frame is known.
            msk[:, 1:] = 0

        # Repeat the first-frame mask 4x so the mask matches the VAE's
        # 4-frames-per-latent temporal packing, then fold groups of 4 into
        # a channel dimension.
        msk = torch.concat(
            [torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1
        )
        msk = msk.view(
            batch_size, msk.shape[1] // 4, 4, height // 8, width // 8
        )  # B F C(4) H W
        msk = msk.transpose(1, 2)
        vae_input = vae_input.permute(0, 2, 1, 3, 4).contiguous()  # B C F H W
        y = pipe.vae.encode(
            vae_input.to(dtype=pipe.torch_dtype, device=pipe.device),
            device=pipe.device,
            tiled=tiled,
            tile_size=tile_size,
            tile_stride=tile_stride,
        )
        y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
        # Conditioning latent = 4 mask channels + VAE latent channels.
        y = torch.concat([msk, y], dim=1)  # B 16+4 F H W
        clip_context = clip_context.to(
            dtype=pipe.torch_dtype, device=pipe.device)
        y = y.to(dtype=pipe.torch_dtype, device=pipe.device)
        return {"clip_feature": clip_context, "y": y}
1240
+
1241
+
1242
class WanVideoUnit_VACE(PipelineUnit):
    """Prepares the VACE conditioning context.

    Encodes the control video split into "inactive" (kept) and "reactive"
    (regenerated) parts, downsamples the pixel mask to latent resolution,
    optionally prepends an encoded reference image along time, and packs
    everything into a single "vace_context" tensor.
    """

    def __init__(self):
        super().__init__(
            input_params=(
                "vace_video",
                "vace_video_mask",
                "vace_reference_image",
                "vace_scale",
                "height",
                "width",
                "num_frames",
                "tiled",
                "tile_size",
                "tile_stride",
            ),
            onload_model_names=("vae",),
        )

    def process(
        self,
        pipe: WanVideoPipeline,
        vace_video,
        vace_video_mask,
        vace_reference_image,
        vace_scale,
        height,
        width,
        num_frames,
        tiled,
        tile_size,
        tile_stride,
    ):
        if (
            vace_video is not None
            or vace_video_mask is not None
            or vace_reference_image is not None
        ):
            # Default control video: all zeros (no guidance content).
            if vace_video is None:
                vace_video = torch.zeros(
                    (1, 3, num_frames, height, width),
                    dtype=pipe.torch_dtype,
                    device=pipe.device,
                )
            else:
                vace_video = pipe.preprocess_video(vace_video)

            # Default mask: everything is reactive (to be generated).
            if vace_video_mask is None:
                vace_video_mask = torch.ones_like(vace_video)
            else:
                vace_video_mask = pipe.preprocess_video(
                    vace_video_mask, min_value=0, max_value=1
                )

            # Split into kept vs. regenerated content and encode each half.
            inactive = vace_video * (1 - vace_video_mask) + 0 * vace_video_mask
            reactive = vace_video * vace_video_mask + 0 * (1 - vace_video_mask)
            inactive = pipe.vae.encode(
                inactive,
                device=pipe.device,
                tiled=tiled,
                tile_size=tile_size,
                tile_stride=tile_stride,
            ).to(dtype=pipe.torch_dtype, device=pipe.device)
            reactive = pipe.vae.encode(
                reactive,
                device=pipe.device,
                tiled=tiled,
                tile_size=tile_size,
                tile_stride=tile_stride,
            ).to(dtype=pipe.torch_dtype, device=pipe.device)
            vace_video_latents = torch.concat((inactive, reactive), dim=1)

            # Downsample the pixel mask to latent resolution: fold 8x8
            # spatial blocks into channels, then 4x temporal pooling.
            vace_mask_latents = rearrange(
                vace_video_mask[0, 0], "T (H P) (W Q) -> 1 (P Q) T H W", P=8, Q=8
            )
            vace_mask_latents = torch.nn.functional.interpolate(
                vace_mask_latents,
                size=(
                    (vace_mask_latents.shape[2] + 3) // 4,
                    vace_mask_latents.shape[3],
                    vace_mask_latents.shape[4],
                ),
                mode="nearest-exact",
            )

            if vace_reference_image is None:
                pass
            else:
                # Encode the reference image and prepend it along time with a
                # zero "reactive" half and a zero mask slot.
                vace_reference_image = pipe.preprocess_video(
                    [vace_reference_image])
                vace_reference_latents = pipe.vae.encode(
                    vace_reference_image,
                    device=pipe.device,
                    tiled=tiled,
                    tile_size=tile_size,
                    tile_stride=tile_stride,
                ).to(dtype=pipe.torch_dtype, device=pipe.device)
                vace_reference_latents = torch.concat(
                    (vace_reference_latents, torch.zeros_like(
                        vace_reference_latents)),
                    dim=1,
                )
                vace_video_latents = torch.concat(
                    (vace_reference_latents, vace_video_latents), dim=2
                )
                vace_mask_latents = torch.concat(
                    (torch.zeros_like(
                        vace_mask_latents[:, :, :1]), vace_mask_latents),
                    dim=2,
                )

            # Final context = encoded video halves + latent mask channels.
            vace_context = torch.concat(
                (vace_video_latents, vace_mask_latents), dim=1)
            return {"vace_context": vace_context, "vace_scale": vace_scale}
        else:
            # No VACE inputs: downstream code skips VACE entirely.
            return {"vace_context": None, "vace_scale": vace_scale}
1359
+
1360
+
1361
class WanVideoUnit_UnifiedSequenceParallel(PipelineUnit):
    """Propagates the pipeline's USP flag into the model inputs."""

    def __init__(self):
        super().__init__(input_params=())

    def process(self, pipe: WanVideoPipeline):
        # Only forward the flag when the pipeline defines it and it is True.
        if getattr(pipe, "use_unified_sequence_parallel", False):
            return {"use_unified_sequence_parallel": True}
        return {}
1370
+
1371
+
1372
class WanVideoUnit_CfgMerger(PipelineUnit):
    """Stacks positive and negative conditioning into one batched pass."""

    def __init__(self):
        super().__init__(take_over=True)
        # Tensors that must be concatenated along the batch dimension.
        self.concat_tensor_names = ["context",
                                    "clip_feature", "y", "reference_latents"]

    def process(self, pipe: WanVideoPipeline, inputs_shared, inputs_posi, inputs_nega):
        if not inputs_shared["cfg_merge"]:
            return inputs_shared, inputs_posi, inputs_nega
        for key in self.concat_tensor_names:
            posi = inputs_posi.get(key)
            nega = inputs_nega.get(key)
            shared = inputs_shared.get(key)
            if posi is not None and nega is not None:
                # Branch-specific tensors: positive first, then negative.
                inputs_shared[key] = torch.concat((posi, nega), dim=0)
            elif shared is not None:
                # Unconditioned tensors are simply duplicated.
                inputs_shared[key] = torch.concat((shared, shared), dim=0)
        # Branch dicts are consumed; everything now lives in inputs_shared.
        inputs_posi.clear()
        inputs_nega.clear()
        return inputs_shared, inputs_posi, inputs_nega
1396
+
1397
+
1398
class TeaCache:
    """Residual cache (TeaCache) that skips DiT forwards when consecutive
    timestep embeddings change little.

    `check` decides per step whether the transformer must run; when it is
    skipped, `update` re-applies the residual stored by `store` after the
    last full forward.
    """

    def __init__(self, num_inference_steps, rel_l1_thresh, model_id):
        self.num_inference_steps = num_inference_steps
        self.rel_l1_thresh = rel_l1_thresh
        self.step = 0
        self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = None
        self.previous_residual = None
        self.previous_hidden_states = None

        # Per-model polynomial coefficients (highest degree first) used to
        # rescale the raw relative-L1 distance of the timestep modulation.
        self.coefficients_dict = {
            "Wan2.1-T2V-1.3B": [
                -5.21862437e04,
                9.23041404e03,
                -5.28275948e02,
                1.36987616e01,
                -4.99875664e-02,
            ],
            "Wan2.1-T2V-14B": [
                -3.03318725e05,
                4.90537029e04,
                -2.65530556e03,
                5.87365115e01,
                -3.15583525e-01,
            ],
            "Wan2.1-I2V-14B-480P": [
                2.57151496e05,
                -3.54229917e04,
                1.40286849e03,
                -1.35890334e01,
                1.32517977e-01,
            ],
            "Wan2.1-I2V-14B-720P": [
                8.10705460e03,
                2.13393892e03,
                -3.72934672e02,
                1.66203073e01,
                -4.17769401e-02,
            ],
        }
        if model_id not in self.coefficients_dict:
            supported_model_ids = ", ".join(self.coefficients_dict)
            raise ValueError(
                f"{model_id} is not a supported TeaCache model id. Please choose a valid model id in ({supported_model_ids})."
            )
        self.coefficients = self.coefficients_dict[model_id]

    def check(self, dit: WanModel, x, t_mod):
        """Return True when the cached residual can replace the forward pass."""
        modulated_inp = t_mod.clone()
        if self.step == 0 or self.step == self.num_inference_steps - 1:
            # Always compute on the first and last steps.
            should_calc = True
            self.accumulated_rel_l1_distance = 0
        else:
            rescale_func = np.poly1d(self.coefficients)
            rel_change = (
                (modulated_inp - self.previous_modulated_input).abs().mean()
                / self.previous_modulated_input.abs().mean()
            )
            self.accumulated_rel_l1_distance += rescale_func(
                rel_change.cpu().item())
            should_calc = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
            if should_calc:
                self.accumulated_rel_l1_distance = 0
        self.previous_modulated_input = modulated_inp
        self.step += 1
        if self.step == self.num_inference_steps:
            # Reset for the next sampling run.
            self.step = 0
        if should_calc:
            # Remember the pre-forward state so `store` can form a residual.
            self.previous_hidden_states = x.clone()
        return not should_calc

    def store(self, hidden_states):
        """Record the residual produced by a full forward pass."""
        self.previous_residual = hidden_states - self.previous_hidden_states
        self.previous_hidden_states = None

    def update(self, hidden_states):
        """Re-apply the stored residual instead of running the transformer."""
        return hidden_states + self.previous_residual
1482
+
1483
+
1484
class TemporalTiler_BCTHW:
    """Runs a model over long (B, C, T, H, W) tensors in overlapping
    temporal windows and linearly blends the overlapping regions."""

    def __init__(self):
        pass

    def build_1d_mask(self, length, left_bound, right_bound, border_width):
        """1-D blending weights: linear ramps at non-boundary edges."""
        weights = torch.ones((length,))
        ramp = (torch.arange(border_width) + 1) / border_width
        if not left_bound:
            weights[:border_width] = ramp
        if not right_bound:
            weights[-border_width:] = torch.flip(ramp, dims=(0,))
        return weights

    def build_mask(self, data, is_bound, border_width):
        """Broadcastable (1, 1, T, 1, 1) temporal blending mask for `data`."""
        length = data.shape[2]
        t = self.build_1d_mask(length, is_bound[0], is_bound[1], border_width[0])
        # Equivalent to einops repeat "T -> 1 1 T 1 1" (pure reshape).
        return t.reshape(1, 1, length, 1, 1)

    def run(
        self,
        model_fn,
        sliding_window_size,
        sliding_window_stride,
        computation_device,
        computation_dtype,
        model_kwargs,
        tensor_names,
        batch_size=None,
    ):
        """Apply `model_fn` window-by-window along T and blend the results.

        `model_kwargs[name]` for each name in `tensor_names` is temporarily
        replaced by its temporal slice, then restored before returning.
        """
        tensor_names = [
            name for name in tensor_names if model_kwargs.get(name) is not None
        ]
        originals = {name: model_kwargs[name] for name in tensor_names}
        reference = originals[tensor_names[0]]
        B, C, T, H, W = reference.shape
        if batch_size is not None:
            # The model may expand the batch (e.g. merged CFG).
            B *= batch_size
        data_device, data_dtype = reference.device, reference.dtype
        value = torch.zeros(
            (B, C, T, H, W), device=data_device, dtype=data_dtype)
        weight = torch.zeros(
            (1, 1, T, 1, 1), device=data_device, dtype=data_dtype)
        for start in range(0, T, sliding_window_stride):
            prev_start = start - sliding_window_stride
            # Skip windows entirely covered by the previous one.
            if prev_start >= 0 and prev_start + sliding_window_size >= T:
                continue
            stop = min(start + sliding_window_size, T)
            for name in tensor_names:
                model_kwargs[name] = originals[name][:, :, start:stop, :].to(
                    device=computation_device, dtype=computation_dtype
                )
            chunk = model_fn(**model_kwargs).to(
                device=data_device, dtype=data_dtype
            )
            mask = self.build_mask(
                chunk,
                is_bound=(start == 0, stop == T),
                border_width=(sliding_window_size - sliding_window_stride,),
            ).to(device=data_device, dtype=data_dtype)
            value[:, :, start:stop, :, :] += chunk * mask
            weight[:, :, start:stop, :, :] += mask
        # Normalize by the accumulated blend weights.
        value /= weight
        model_kwargs.update(originals)
        return value
1562
+
1563
+
1564
def model_fn_wan_video(
    dit: WanModel,
    motion_controller: WanMotionControllerModel = None,
    vace: VaceWanModel = None,
    latents: torch.Tensor = None,
    timestep: torch.Tensor = None,
    context: torch.Tensor = None,
    clip_feature: Optional[torch.Tensor] = None,
    y: Optional[torch.Tensor] = None,
    reference_latents=None,
    vace_context=None,
    vace_scale=1.0,
    tea_cache: TeaCache = None,
    use_unified_sequence_parallel: bool = False,
    motion_bucket_id: Optional[torch.Tensor] = None,
    sliding_window_size: Optional[int] = None,
    sliding_window_stride: Optional[int] = None,
    cfg_merge: bool = False,
    use_gradient_checkpointing: bool = False,
    use_gradient_checkpointing_offload: bool = False,
    **kwargs,
):
    """Single denoising forward pass of the Wan DiT.

    Embeds the timestep and text context, optionally appends image
    conditioning, patchifies the latents, runs the transformer blocks
    (with optional TeaCache skipping, VACE hints, gradient checkpointing
    and unified sequence parallelism), and unpatchifies the prediction.
    """
    # Sliding-window path: recurse over temporal tiles and blend results.
    if sliding_window_size is not None and sliding_window_stride is not None:
        model_kwargs = dict(
            dit=dit,
            motion_controller=motion_controller,
            vace=vace,
            latents=latents,
            timestep=timestep,
            context=context,
            clip_feature=clip_feature,
            y=y,
            reference_latents=reference_latents,
            vace_context=vace_context,
            vace_scale=vace_scale,
            tea_cache=tea_cache,
            use_unified_sequence_parallel=use_unified_sequence_parallel,
            motion_bucket_id=motion_bucket_id,
        )
        return TemporalTiler_BCTHW().run(
            model_fn_wan_video,
            sliding_window_size,
            sliding_window_stride,
            latents.device,
            latents.dtype,
            model_kwargs=model_kwargs,
            tensor_names=["latents", "y"],
            batch_size=2 if cfg_merge else 1,
        )

    if use_unified_sequence_parallel:
        # Imported lazily: xfuser is only needed in distributed runs.
        import torch.distributed as dist
        from xfuser.core.distributed import (get_sequence_parallel_rank,
                                             get_sequence_parallel_world_size,
                                             get_sp_group)

    # Timestep embedding and its 6-way modulation projection.
    t = dit.time_embedding(sinusoidal_embedding_1d(dit.freq_dim, timestep))
    t_mod = dit.time_projection(t).unflatten(1, (6, dit.dim))
    if motion_bucket_id is not None and motion_controller is not None:
        # Motion speed control shifts the modulation signal.
        t_mod = t_mod + \
            motion_controller(motion_bucket_id).unflatten(1, (6, dit.dim))
    context = dit.text_embedding(context)

    # Merged cfg: replicate latents/timestep to match the context batch.
    if latents.shape[0] != context.shape[0]:
        latents = torch.concat([latents] * context.shape[0], dim=0)

    if timestep.shape[0] != context.shape[0]:
        timestep = torch.concat([timestep] * context.shape[0], dim=0)
    if dit.has_image_input:
        # Image conditioning: stack the masked VAE latent on the channel
        # axis and prepend the CLIP embedding to the text context.
        latents = torch.cat([latents, y], dim=1)  # (b, c_x + c_y, f, h, w)
        clip_embdding = dit.img_emb(clip_feature)
        context = torch.cat([clip_embdding, context], dim=1)

    latents, (f, h, w) = dit.patchify(latents, None)
    # NOTE(review): `_shortcut` is captured but never used below.
    _shortcut = latents
    # 3D RoPE frequencies for the (f, h, w) token grid.
    freqs = (
        torch.cat(
            [
                dit.freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
                dit.freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
                dit.freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1),
            ],
            dim=-1,
        )
        .reshape(f * h * w, 1, -1)
        .to(latents.device)
    )

    if tea_cache is not None:
        # True => reuse the cached residual and skip the transformer.
        tea_cache_update = tea_cache.check(dit, latents, t_mod)
    else:
        tea_cache_update = False

    if use_unified_sequence_parallel:
        if dist.is_initialized() and dist.get_world_size() > 1:
            # Each rank processes its own slice of the token sequence.
            latents = torch.chunk(latents, get_sequence_parallel_world_size(), dim=1)[
                get_sequence_parallel_rank()
            ]

    if tea_cache_update:
        latents = tea_cache.update(latents)
    else:
        def create_custom_forward(module):
            # Wrapper required by torch.utils.checkpoint.
            def custom_forward(*inputs, **kwargs):
                return module(*inputs, **kwargs)
            return custom_forward

        for idx, block in enumerate(dit.blocks):
            if use_gradient_checkpointing_offload:
                # Checkpoint and additionally park activations on CPU.
                with torch.autograd.graph.save_on_cpu():
                    latents = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        latents,
                        context,
                        t_mod,
                        freqs,
                        use_reentrant=False,
                    )
            elif use_gradient_checkpointing:
                latents = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    latents,
                    context,
                    t_mod,
                    freqs,
                    use_reentrant=False,
                )
            else:
                latents = block(latents, context, t_mod, freqs)

            if vace_context is not None and idx in vace.vace_layers_mapping:
                # NOTE(review): `vace_hints` is never defined in this
                # function — upstream implementations compute it from the
                # VACE model before this loop; as written, this branch will
                # raise NameError whenever vace_context is not None. Confirm.
                current_vace_hint = vace_hints[vace.vace_layers_mapping[idx]]
                if (
                    use_unified_sequence_parallel
                    and dist.is_initialized()
                    and dist.get_world_size() > 1
                ):
                    current_vace_hint = torch.chunk(
                        current_vace_hint, get_sequence_parallel_world_size(), dim=1
                    )[get_sequence_parallel_rank()]
                latents = latents + current_vace_hint * vace_scale
        if tea_cache is not None:
            # Record the residual of this full forward for future skips.
            tea_cache.store(latents)

    latents = dit.head(latents, t)

    if use_unified_sequence_parallel:
        if dist.is_initialized() and dist.get_world_size() > 1:
            # Reassemble the full token sequence across ranks.
            latents = get_sp_group().all_gather(latents, dim=1)
    # Remove reference latents
    if reference_latents is not None:
        latents = latents[:, reference_latents.shape[1]:]
        f -= 1

    latents = dit.unpatchify(latents, (f, h, w))
    return latents
diffsynth/schedulers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .ddim import EnhancedDDIMScheduler
2
+ from .continuous_ode import ContinuousODEScheduler
3
+ from .flow_match import FlowMatchScheduler
diffsynth/schedulers/continuous_ode.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class ContinuousODEScheduler():
    """Continuous-sigma ODE scheduler with a Karras-style (rho) noise schedule.

    Sigmas interpolate between ``sigma_max`` and ``sigma_min`` in 1/rho-power
    space; timesteps are ``log(sigma) * 0.25`` (EDM-style conditioning input).
    """

    def __init__(self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0):
        self.sigma_max = sigma_max
        self.sigma_min = sigma_min
        self.rho = rho
        self.set_timesteps(num_inference_steps)

    def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, **kwargs):
        """Build the sigma/timestep schedule.

        ``denoising_strength < 1`` starts the ramp part-way (lower initial sigma).
        """
        ramp = torch.linspace(1 - denoising_strength, 1, num_inference_steps)
        min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
        max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
        self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho)
        self.timesteps = torch.log(self.sigmas) * 0.25

    def step(self, model_output, timestep, sample, to_final=False):
        """Take one Euler ODE step from `timestep` toward the next scheduled sigma.

        Bug fix: the original did ``sample *= ...`` which mutated the caller's
        tensor in place; the rescaling is now done out of place.
        """
        timestep_id = torch.argmin((self.timesteps - timestep).abs())
        sigma = self.sigmas[timestep_id]
        # Undo the 1/sqrt(sigma^2+1) input scaling (see add_noise) — out of place.
        sample = sample * (sigma * sigma + 1).sqrt()
        estimated_sample = -sigma / (sigma * sigma + 1).sqrt() * model_output + 1 / (sigma * sigma + 1) * sample
        if to_final or timestep_id + 1 >= len(self.timesteps):
            prev_sample = estimated_sample
        else:
            sigma_ = self.sigmas[timestep_id + 1]
            derivative = 1 / sigma * (sample - estimated_sample)
            prev_sample = sample + derivative * (sigma_ - sigma)
            # Re-apply the input scaling convention for the next step.
            prev_sample /= (sigma_ * sigma_ + 1).sqrt()
        return prev_sample

    def return_to_timestep(self, timestep, sample, sample_stablized):
        # This scheduler doesn't support this function.
        pass

    def add_noise(self, original_samples, noise, timestep):
        """Noise clean samples to the sigma of `timestep`, scaled by 1/sqrt(sigma^2+1)."""
        timestep_id = torch.argmin((self.timesteps - timestep).abs())
        sigma = self.sigmas[timestep_id]
        sample = (original_samples + noise * sigma) / (sigma * sigma + 1).sqrt()
        return sample

    def training_target(self, sample, noise, timestep):
        """Regression target for training at `timestep`'s sigma."""
        timestep_id = torch.argmin((self.timesteps - timestep).abs())
        sigma = self.sigmas[timestep_id]
        target = (-(sigma * sigma + 1).sqrt() / sigma + 1 / (sigma * sigma + 1).sqrt() / sigma) * sample + 1 / (sigma * sigma + 1).sqrt() * noise
        return target

    def training_weight(self, timestep):
        """Loss weight at `timestep` (grows as sigma shrinks)."""
        timestep_id = torch.argmin((self.timesteps - timestep).abs())
        sigma = self.sigmas[timestep_id]
        weight = (1 + sigma * sigma).sqrt() / sigma
        return weight
diffsynth/schedulers/ddim.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, math
2
+
3
+
4
class EnhancedDDIMScheduler():
    """DDIM scheduler with optional zero-terminal-SNR rescaling.

    ``alphas_cumprod`` is stored as a plain Python list indexed by integer
    timestep. Timesteps are aligned to 999...0 (see ``set_timesteps``).
    """

    def __init__(self, num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="epsilon", rescale_zero_terminal_snr=False):
        # Build the beta schedule and the cumulative alpha products used below.
        self.num_train_timesteps = num_train_timesteps
        if beta_schedule == "scaled_linear":
            betas = torch.square(torch.linspace(math.sqrt(beta_start), math.sqrt(beta_end), num_train_timesteps, dtype=torch.float32))
        elif beta_schedule == "linear":
            betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        else:
            raise NotImplementedError(f"{beta_schedule} is not implemented")
        self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
        if rescale_zero_terminal_snr:
            self.alphas_cumprod = self.rescale_zero_terminal_snr(self.alphas_cumprod)
        # Keep as a list so integer timesteps index it directly.
        self.alphas_cumprod = self.alphas_cumprod.tolist()
        self.set_timesteps(10)
        self.prediction_type = prediction_type

    def rescale_zero_terminal_snr(self, alphas_cumprod):
        """Rescale alphas_cumprod so the terminal step has exactly zero SNR."""
        alphas_bar_sqrt = alphas_cumprod.sqrt()

        # Store old values.
        alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
        alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()

        # Shift so the last timestep is zero.
        alphas_bar_sqrt -= alphas_bar_sqrt_T

        # Scale so the first timestep is back to the old value.
        alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)

        # Square to recover alphas_bar (this returns rescaled alphas_cumprod,
        # not betas).
        alphas_bar = alphas_bar_sqrt.square()  # Revert sqrt

        return alphas_bar

    def set_timesteps(self, num_inference_steps, denoising_strength=1.0, **kwargs):
        """Choose inference timesteps, evenly spaced from max_timestep down to 0."""
        # The timesteps are aligned to 999...0, which is different from other implementations,
        # but I think this implementation is more reasonable in theory.
        max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0)
        num_inference_steps = min(num_inference_steps, max_timestep + 1)
        if num_inference_steps == 1:
            self.timesteps = torch.Tensor([max_timestep])
        else:
            step_length = max_timestep / (num_inference_steps - 1)
            self.timesteps = torch.Tensor([round(max_timestep - i*step_length) for i in range(num_inference_steps)])

    def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev):
        """One deterministic DDIM update from alpha_prod_t to alpha_prod_t_prev."""
        if self.prediction_type == "epsilon":
            weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t)
            weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t)
            prev_sample = sample * weight_x + model_output * weight_e
        elif self.prediction_type == "v_prediction":
            weight_e = -math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t)) + math.sqrt(alpha_prod_t * (1 - alpha_prod_t_prev))
            weight_x = math.sqrt(alpha_prod_t * alpha_prod_t_prev) + math.sqrt((1 - alpha_prod_t) * (1 - alpha_prod_t_prev))
            prev_sample = sample * weight_x + model_output * weight_e
        else:
            raise NotImplementedError(f"{self.prediction_type} is not implemented")
        return prev_sample

    def step(self, model_output, timestep, sample, to_final=False):
        """Advance `sample` one scheduled step; final step uses alpha_prod_prev = 1.

        NOTE(review): `timestep.flatten()` assumes a tensor timestep even though
        the isinstance check below suggests plain numbers were also intended —
        confirm against callers.
        """
        alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
        if isinstance(timestep, torch.Tensor):
            timestep = timestep.cpu()
        timestep_id = torch.argmin((self.timesteps - timestep).abs())
        if to_final or timestep_id + 1 >= len(self.timesteps):
            alpha_prod_t_prev = 1.0
        else:
            timestep_prev = int(self.timesteps[timestep_id + 1])
            alpha_prod_t_prev = self.alphas_cumprod[timestep_prev]

        return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev)

    def return_to_timestep(self, timestep, sample, sample_stablized):
        """Recover the epsilon that maps `sample_stablized` back to `sample` at `timestep`."""
        alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
        noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(1 - alpha_prod_t)
        return noise_pred

    def add_noise(self, original_samples, noise, timestep):
        """Standard DDPM forward process: sqrt(a)*x0 + sqrt(1-a)*eps."""
        sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
        sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
        return noisy_samples

    def training_target(self, sample, noise, timestep):
        """Training target: epsilon itself, or the v-prediction target otherwise."""
        if self.prediction_type == "epsilon":
            return noise
        else:
            sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
            sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
            target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
            return target

    def training_weight(self, timestep):
        """Uniform loss weighting."""
        return 1.0
diffsynth/schedulers/flow_match.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class FlowMatchScheduler:
    """Flow-matching scheduler reduced to a single configurable timestep.

    ``set_timesteps`` produces one (timestep, sigma, weight) triple derived
    from ``denoise_step``; ``step``/``training_target`` switch between
    predicting the clean sample ('x') and the flow (noise - sample).
    """

    def __init__(
        self,
        num_inference_steps=100,
        num_train_timesteps=1000,
        shift=3.0,
        sigma_max=1.0,
        sigma_min=0.003 / 1.002,
        inverse_timesteps=False,
        extra_one_step=False,
        reverse_sigmas=False,
        training_target='x',
        training_weight_type='default'
    ):
        # Schedule hyper-parameters (several are kept only for interface
        # compatibility and are unused by the single-step schedule).
        self.num_train_timesteps = num_train_timesteps
        self.shift = shift
        self.sigma_max = sigma_max
        self.sigma_min = sigma_min
        self.inverse_timesteps = inverse_timesteps
        self.extra_one_step = extra_one_step
        self.reverse_sigmas = reverse_sigmas
        self.training_weight_type = training_weight_type

        # Schedule state; populated by set_timesteps().
        self.target = None
        self.timesteps = None
        self.sigmas = None
        self.linear_timesteps_weights = None
        self.training = False

        self.set_training_target(training_target=training_target)
        self.set_training_weight(training_weight_type=training_weight_type)

    def set_training_target(self, training_target='x'):
        """Select what the model predicts: 'x' (clean sample) or 'flow'."""
        self.target = training_target

    def set_training_weight(self, training_weight_type):
        """Validate and store the loss-weighting mode."""
        valid_types = ["default", "equal", "early", "late"]
        assert training_weight_type in valid_types, \
            f"training_weight_type must be one of {valid_types}"
        self.training_weight_type = training_weight_type

    def set_timesteps(
        self,
        num_inference_steps=100,  # kept for signature compatibility
        denoising_strength=1.0,   # kept for signature compatibility
        training=False,
        shift=None,
        denoise_step=0.5,
        **kwargs
    ):
        """Build a one-element schedule at `denoise_step` (fraction of train steps)."""
        if shift is not None:
            self.shift = shift
        self.training = training

        # Single-value schedule: timestep = T * denoise_step, sigma = denoise_step.
        timestep_value = self.num_train_timesteps * denoise_step
        sigma_value = timestep_value / self.num_train_timesteps

        self.timesteps = torch.tensor([timestep_value], dtype=torch.float32)
        self.sigmas = torch.tensor([sigma_value], dtype=torch.float32)

        if self.training:
            # Fixed empirical loss weight — presumably tuned offline; verify.
            self.linear_timesteps_weights = torch.tensor([1.795], dtype=torch.float32)
        else:
            self.linear_timesteps_weights = None

    def step(self, model_output, sample, to_final=False, **kwargs):
        """Recover the denoised sample from the model prediction."""
        if self.target == 'x':
            return model_output
        if self.target == 'flow':
            return sample - model_output

    def training_target(self, sample, noise, timestep):
        """Supervision target matching the configured prediction mode."""
        if self.target == 'x':
            return sample
        if self.target == 'flow':
            return noise - sample

    def training_weight(self, timestep):
        """Loss weight: the single scheduled weight when training, else 1.0."""
        weights = self.linear_timesteps_weights
        return weights[0] if weights is not None else 1.0
101
+
102
+
103
+ if __name__ == "__main__":
104
+ scheduler = FlowMatchScheduler()
105
+ scheduler.set_training_weight("default")
106
+ scheduler.set_timesteps(
107
+ num_inference_steps=1,
108
+ training=True,
109
+ schedule_mode="default",
110
+ denoise_step=1,
111
+ shift=5
112
+ )
113
+
114
+ for step, sigma, weight in zip(scheduler.timesteps, scheduler.sigmas, scheduler.linear_timesteps_weights):
115
+ print(
116
+ f"Step: {step.item()}, Sigma: {sigma.item()}, Weight: {weight.item()}")
diffsynth/util/alignment.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Bingxin Ke
2
+ # Last modified: 2024-01-11
3
+
4
+ import numpy as np
5
+ import torch
6
def align_depth_least_square_video(
    gt_arr: np.ndarray,
    pred_arr: np.ndarray,
    valid_mask_arr: np.ndarray,
    return_scale_shift=True,
    max_resolution=None,
):
    """Fit one global affine (scale, shift) aligning `pred_arr` to `gt_arr` over all frames.

    Args:
        gt_arr, pred_arr, valid_mask_arr: shape (T, H, W) or (T, 1, H, W).
        return_scale_shift: also return the fitted scale and shift.
        max_resolution: optionally downsample frames before fitting (the
            alignment is still applied at the original resolution).
    """
    original_shape = pred_arr.shape

    # Collapse any singleton channel dim: (T, 1, H, W) -> (T, H, W).
    gt = gt_arr.squeeze()
    pred = pred_arr.squeeze()
    mask = valid_mask_arr.squeeze()

    # Optional downsampling (applied per-frame identically).
    if max_resolution is not None:
        frame_h, frame_w = gt.shape[-2:]
        factor = np.min(max_resolution / np.array([frame_h, frame_w]))
        if factor < 1:
            shrink = torch.nn.Upsample(scale_factor=float(factor), mode="nearest")
            gt = shrink(torch.as_tensor(gt).unsqueeze(1)).squeeze(1).numpy()
            pred = shrink(torch.as_tensor(pred).unsqueeze(1)).squeeze(1).numpy()
            mask = (
                shrink(torch.as_tensor(mask).unsqueeze(1).float())
                .squeeze(1).bool().numpy()
            )

    assert gt.shape == pred.shape == mask.shape, f"{gt.shape}, {pred.shape}, {mask.shape}"

    # Flatten valid pixels from ALL frames into one least-squares system.
    gt_vec = gt[mask].reshape(-1, 1)      # (N, 1)
    pred_vec = pred[mask].reshape(-1, 1)  # (N, 1)

    design = np.concatenate([pred_vec, np.ones_like(pred_vec)], axis=-1)  # (N, 2)
    solution = np.linalg.lstsq(design, gt_vec, rcond=None)[0]
    scale, shift = solution

    # Apply at the ORIGINAL resolution (not the downsampled copy).
    aligned_pred = (pred_arr * scale + shift).reshape(original_shape)

    if return_scale_shift:
        return aligned_pred, scale, shift
    return aligned_pred
64
+
65
+
66
def align_depth_least_square(
    gt_arr: np.ndarray,
    pred_arr: np.ndarray,
    valid_mask_arr: np.ndarray,
    return_scale_shift=True,
    max_resolution=None,
):
    """Fit an affine (scale, shift) aligning a single predicted depth map to GT.

    Args:
        gt_arr, pred_arr, valid_mask_arr: 2D maps (extra singleton dims are squeezed).
        return_scale_shift: also return the fitted scale and shift.
        max_resolution: optionally downsample before fitting; the transform is
            applied at the original resolution.
    """
    original_shape = pred_arr.shape

    gt = gt_arr.squeeze()          # [H, W]
    pred = pred_arr.squeeze()
    mask = valid_mask_arr.squeeze()

    # Optional downsampling before the fit.
    if max_resolution is not None:
        factor = np.min(max_resolution / np.array(original_shape[-2:]))
        if factor < 1:
            shrink = torch.nn.Upsample(scale_factor=factor, mode="nearest")
            gt = shrink(torch.as_tensor(gt).unsqueeze(0)).numpy()
            pred = shrink(torch.as_tensor(pred).unsqueeze(0)).numpy()
            mask = (
                shrink(torch.as_tensor(mask).unsqueeze(0).float())
                .bool()
                .numpy()
            )

    assert (
        gt.shape == pred.shape == mask.shape
    ), f"{gt.shape}, {pred.shape}, {mask.shape}"

    gt_vec = gt[mask].reshape((-1, 1))
    pred_vec = pred[mask].reshape((-1, 1))

    # Least squares: gt ≈ scale * pred + shift.
    design = np.concatenate([pred_vec, np.ones_like(pred_vec)], axis=-1)
    solution = np.linalg.lstsq(design, gt_vec, rcond=None)[0]
    scale, shift = solution

    aligned_pred = (pred_arr * scale + shift).reshape(original_shape)

    if return_scale_shift:
        return aligned_pred, scale, shift
    return aligned_pred
114
+
115
+
116
+ # ******************** disparity space ********************
117
def depth2disparity(depth, return_mask=False):
    """Convert depth to disparity (1/depth); non-positive depths map to 0.

    Args:
        depth: torch.Tensor or np.ndarray of depths.
        return_mask: if True, also return the boolean mask of positive depths.

    Raises:
        TypeError: for unsupported input types (the original code fell through
            to a NameError on `disparity` in that case).
    """
    if isinstance(depth, torch.Tensor):
        disparity = torch.zeros_like(depth)
    elif isinstance(depth, np.ndarray):
        disparity = np.zeros_like(depth)
    else:
        raise TypeError(f"Unsupported depth type: {type(depth)!r}")
    non_negative_mask = depth > 0
    disparity[non_negative_mask] = 1.0 / depth[non_negative_mask]
    if return_mask:
        return disparity, non_negative_mask
    else:
        return disparity
128
+
129
+
130
def disparity2depth(disparity, **kwargs):
    """Convert disparity back to depth.

    Inversion (1/x) is involutive, so this simply reuses depth2disparity;
    kwargs (e.g. ``return_mask``) are forwarded unchanged.
    """
    return depth2disparity(disparity, **kwargs)
diffsynth/util/depth_transform.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Bingxin Ke
2
+ # Last modified: 2024-02-08
3
+
4
+ import torch
5
+
6
+
7
def get_depth_normalizer(cfg_normalizer):
    """Build a depth-normalization callable from a config object.

    Args:
        cfg_normalizer: None (identity transform) or a config with a ``type``
            field; only "near_far_metric" is implemented.

    Returns:
        A callable mapping raw depth to normalized depth.

    Raises:
        NotImplementedError: for unknown normalizer types.
    """
    if cfg_normalizer is None:
        # No config: pass depth through unchanged.
        def identical(x):
            return x
        return identical

    if cfg_normalizer.type == "near_far_metric":
        return NearFarMetricNormalizer(
            norm_min=cfg_normalizer.norm_min,
            norm_max=cfg_normalizer.norm_max,
            min_max_quantile=cfg_normalizer.min_max_quantile,
            clip=cfg_normalizer.clip,
        )

    raise NotImplementedError
25
+
26
+
27
class DepthNormalizerBase:
    """Interface for depth normalizers mapping raw depth into a model range.

    Subclasses override the class attributes and the methods below. Note that
    subclasses do NOT call super().__init__() — it raises to prevent direct
    instantiation of the base class.
    """

    # Whether normalized depth is relative (invariant to global scale/shift).
    is_relative = None
    # Whether the far plane maps to norm_max.
    far_plane_at_max = None

    def __init__(
        self,
        norm_min=-1.0,
        norm_max=1.0,
    ) -> None:
        self.norm_min = norm_min
        self.norm_max = norm_max
        raise NotImplementedError  # abstract base: must be subclassed

    def __call__(self, depth, valid_mask=None, clip=None):
        """Normalize `depth` into [norm_min, norm_max]."""
        raise NotImplementedError

    def denormalize(self, depth_norm, **kwargs):
        # For metric depth: convert prediction back to metric depth
        # For relative depth: convert prediction to [0, 1]
        raise NotImplementedError
47
+
48
+
49
class NearFarMetricNormalizer(DepthNormalizerBase):
    """Affinely map depth in [0, d_max] to [norm_min, norm_max].

    Near/far planes are taken as low/high quantiles of the valid depths, so a
    few outliers cannot dominate the normalization range.
    """

    is_relative = True
    far_plane_at_max = True

    def __init__(
        self, norm_min=-1.0, norm_max=1.0, min_max_quantile=0.02, clip=True
    ) -> None:
        # Deliberately does not call super().__init__() (the base raises).
        self.norm_min = norm_min
        self.norm_max = norm_max
        self.norm_range = norm_max - norm_min
        self.min_quantile = min_max_quantile
        self.max_quantile = 1.0 - min_max_quantile
        self.clip = clip

    def __call__(self, depth_linear, valid_mask=None, clip=None):
        """Normalize `depth_linear`; `clip` overrides the instance setting."""
        do_clip = self.clip if clip is None else clip

        if valid_mask is None:
            valid_mask = torch.ones_like(depth_linear).bool()
        # Non-positive depths are never considered valid.
        valid_mask = valid_mask & (depth_linear > 0)

        # Quantile-based near/far planes over the valid pixels.
        near, far = torch.quantile(
            depth_linear[valid_mask],
            torch.tensor([self.min_quantile, self.max_quantile]),
        )

        normalized = (depth_linear - near) / (far - near) * self.norm_range + self.norm_min

        if do_clip:
            normalized = torch.clip(normalized, self.norm_min, self.norm_max)

        return normalized

    def scale_back(self, depth_norm):
        """Map normalized depth back to [0, 1]."""
        return (depth_norm - self.norm_min) / self.norm_range

    def denormalize(self, depth_norm, **kwargs):
        return self.scale_back(depth_norm=depth_norm)
diffsynth/util/metric.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Bingxin Ke
2
+ # Last modified: 2024-02-15
3
+
4
+
5
+ import pandas as pd
6
+ import torch
7
+
8
+
9
+ # Adapted from: https://github.com/victoresque/pytorch-template/blob/master/utils/util.py
10
# Adapted from: https://github.com/victoresque/pytorch-template/blob/master/utils/util.py
class MetricTracker:
    """Running total/count/average per metric key, backed by a pandas DataFrame.

    If a ``writer`` with an ``add_scalar`` method is supplied, every update is
    also logged to it.
    """

    def __init__(self, *keys, writer=None):
        self.writer = writer
        self._data = pd.DataFrame(
            index=keys, columns=["total", "counts", "average"])
        self.reset()

    def reset(self):
        """Zero all accumulators."""
        for column in self._data.columns:
            self._data[column].values[:] = 0

    def update(self, key, value, n=1):
        """Accumulate `value` (repeated `n` times) into `key` and refresh its average."""
        if self.writer is not None:
            self.writer.add_scalar(key, value)
        self._data.loc[key, "total"] += value * n
        self._data.loc[key, "counts"] += n
        self._data.loc[key, "average"] = (
            self._data.total[key] / self._data.counts[key]
        )

    def avg(self, key):
        """Current average for `key`."""
        return self._data.average[key]

    def result(self):
        """All current averages as a plain dict."""
        return dict(self._data.average)
34
+
35
+
36
def pixel_mean(pred, gt, valid_mask):
    """Absolute difference between the (optionally masked) means of pred and gt.

    Means are taken over dims (0, 1); `valid_mask` weights pixels when given.
    """
    if valid_mask is None:
        mean_pred = torch.mean(pred, dim=(0, 1))
        mean_gt = torch.mean(gt, dim=(0, 1))
    else:
        num_valid = torch.sum(valid_mask, dim=(0, 1))
        mean_pred = torch.sum(pred * valid_mask, dim=(0, 1)) / num_valid
        mean_gt = torch.sum(gt * valid_mask, dim=(0, 1)) / num_valid
    return torch.abs(mean_pred - mean_gt)
51
+
52
+
53
def pixel_var(pred, gt, valid_mask):
    """Absolute difference between the (optionally masked) variances of pred and gt.

    Masked variance is the population variance over valid pixels; the unmasked
    path uses torch.var over dims (0, 1).
    """
    if valid_mask is None:
        var_pred = torch.var(pred, dim=(0, 1))
        var_gt = torch.var(gt, dim=(0, 1))
    else:
        num_valid = torch.sum(valid_mask, dim=(0, 1))
        mean_pred = torch.sum(pred * valid_mask, dim=(0, 1)) / num_valid
        mean_gt = torch.sum(gt * valid_mask, dim=(0, 1)) / num_valid
        var_pred = torch.sum(
            valid_mask * (pred - mean_pred) ** 2, dim=(0, 1)) / num_valid
        var_gt = torch.sum(
            valid_mask * (gt - mean_gt) ** 2, dim=(0, 1)) / num_valid
    return torch.abs(var_pred - var_gt)
74
+
75
+
76
def abs_relative_difference(output, target, valid_mask=None):
    """Mean absolute relative error |output - target| / target.

    Averaged per image over the last two dims, then over the batch; invalid
    pixels are zeroed and excluded from the count.
    """
    rel_err = torch.abs(output - target) / target
    if valid_mask is None:
        num = output.shape[-1] * output.shape[-2]
    else:
        rel_err[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))
    per_image = torch.sum(rel_err, (-1, -2)) / num
    return per_image.mean()
90
+
91
+
92
def squared_relative_difference(output, target, valid_mask=None):
    """Mean squared relative error (output - target)^2 / target.

    Averaged per image over the last two dims, then over the batch.
    """
    sq_rel = torch.pow(torch.abs(output - target), 2) / target
    if valid_mask is None:
        num = output.shape[-1] * output.shape[-2]
    else:
        sq_rel[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))
    per_image = torch.sum(sq_rel, (-1, -2)) / num
    return per_image.mean()
105
+
106
+
107
def rmse_linear(output, target, valid_mask=None):
    """Per-image RMSE in linear depth space, averaged over the batch."""
    residual = output - target
    if valid_mask is None:
        num = output.shape[-1] * output.shape[-2]
    else:
        residual[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))
    mse = torch.sum(residual ** 2, (-1, -2)) / num
    return torch.sqrt(mse).mean()
120
+
121
+
122
def rmse_log(output, target, valid_mask=None):
    """Per-image RMSE in log-depth space, averaged over the batch."""
    log_residual = torch.log(output) - torch.log(target)
    if valid_mask is None:
        num = output.shape[-1] * output.shape[-2]
    else:
        log_residual[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))
    mse = torch.sum(log_residual ** 2, (-1, -2)) / num  # [B]
    return torch.sqrt(mse).mean()
133
+
134
+
135
def log10(output, target, valid_mask=None):
    """Mean absolute error in log10 space over (optionally masked) pixels."""
    if valid_mask is None:
        err = torch.abs(torch.log10(output) - torch.log10(target))
    else:
        err = torch.abs(
            torch.log10(output[valid_mask]) - torch.log10(target[valid_mask])
        )
    return err.mean()
143
+
144
+
145
+ # adapt from: https://github.com/imran3180/depth-map-prediction/blob/master/main.py
146
def threshold_percentage(output, target, threshold_val, valid_mask=None):
    """Fraction of pixels with max(output/target, target/output) < threshold_val.

    Adapted from https://github.com/imran3180/depth-map-prediction; the
    comparison is computed on CPU regardless of the input device.

    Bug fix: when `valid_mask` is None the pixel count is a plain int, so the
    original `n.cpu()` raised AttributeError; `.cpu()` is now applied only in
    the tensor case.
    """
    d1 = output / target
    d2 = target / output
    max_d1_d2 = torch.max(d1, d2)
    zero = torch.zeros(*output.shape)
    one = torch.ones(*output.shape)
    bit_mat = torch.where(max_d1_d2.cpu() < threshold_val, one, zero)
    if valid_mask is not None:
        bit_mat[~valid_mask] = 0
        n = valid_mask.sum((-1, -2)).cpu()
    else:
        n = output.shape[-1] * output.shape[-2]
    count_mat = torch.sum(bit_mat, (-1, -2))
    threshold_mat = count_mat / n
    return threshold_mat.mean()
161
+
162
+
163
def delta1_acc(pred, gt, valid_mask):
    """Delta-1 accuracy: fraction of pixels with max(pred/gt, gt/pred) < 1.25."""
    return threshold_percentage(pred, gt, 1.25, valid_mask)
165
+
166
+
167
def delta2_acc(pred, gt, valid_mask):
    """Delta-2 accuracy: fraction of pixels with max(pred/gt, gt/pred) < 1.25^2."""
    return threshold_percentage(pred, gt, 1.25**2, valid_mask)
169
+
170
+
171
def delta3_acc(pred, gt, valid_mask):
    """Delta-3 accuracy: fraction of pixels with max(pred/gt, gt/pred) < 1.25^3."""
    return threshold_percentage(pred, gt, 1.25**3, valid_mask)
173
+
174
+
175
def i_rmse(output, target, valid_mask=None):
    """Per-image RMSE of inverse depth (1/output vs 1/target), batch-averaged."""
    residual = 1.0 / output - 1.0 / target
    if valid_mask is None:
        num = output.shape[-1] * output.shape[-2]
    else:
        residual[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))
    mse = torch.sum(residual ** 2, (-1, -2)) / num  # [B]
    return torch.sqrt(mse).mean()
188
+
189
+
190
def silog_rmse(depth_pred, depth_gt, valid_mask=None):
    """Scale-invariant log RMSE (SILog), scaled by 100.

    Subtracting the squared mean log-ratio makes the metric invariant to a
    global depth scale.
    """
    log_diff = torch.log(depth_pred) - torch.log(depth_gt)
    if valid_mask is None:
        num = depth_gt.shape[-2] * depth_gt.shape[-1]
    else:
        log_diff[~valid_mask] = 0
        num = valid_mask.sum((-1, -2))

    sq = torch.pow(log_diff, 2)
    first_term = torch.sum(sq, (-1, -2)) / num
    second_term = torch.pow(torch.sum(log_diff, (-1, -2)), 2) / (num ** 2)
    return torch.sqrt(torch.mean(first_term - second_term)) * 100
204
+
205
+
206
+
207
def relative_temporal_diff(pred, gt, valid_mask=None, eps=1e-6):
    """RMSE of the frame-to-frame *relative* change between pred and gt.

    Args:
        pred, gt: [F, H, W] depth sequences.
        valid_mask: optional [F, H, W] bool mask.
        eps: stabilizer for the per-frame division.
    """
    # Relative temporal difference of consecutive frames.
    rel_pred = (pred[1:] - pred[:-1]) / (pred[:-1] + eps)  # [F-1, H, W]
    rel_gt = (gt[1:] - gt[:-1]) / (gt[:-1] + eps)
    residual = rel_pred - rel_gt

    if valid_mask is None:
        num = residual.shape[-1] * residual.shape[-2]
    else:
        # A pair counts only when BOTH frames are valid.
        pair_mask = valid_mask[1:] & valid_mask[:-1]
        residual[~pair_mask] = 0
        num = pair_mask.sum((-1, -2))  # [F-1]

    mse = torch.sum(residual ** 2, (-1, -2)) / num
    return torch.sqrt(mse).mean()
234
+
235
+
236
def boundary_metrics(pred_depth, rgb, valid_mask=None,
                     th_depth_ratio=1.05, th_rgb_grad=0.15,
                     tolerance=1, eps=1e-6):
    """Edge-based precision/recall/F1 between predicted-depth discontinuities
    and RGB (Sobel) edges.

    Args:
        pred_depth: (B, H, W) predicted depth.
        rgb: (B, 3, H, W) image (or (B, 1, H, W) grayscale) used as edge GT.
        valid_mask: optional (B, H, W) mask of evaluable pixels.
        th_depth_ratio: neighbor depth ratio above which a depth edge is declared.
        th_rgb_grad: normalized gradient magnitude above which an RGB edge is declared.
        tolerance: dilation radius (pixels) when matching edges.

    Returns:
        dict with float entries 'f1', 'precision', 'recall'.

    Bug fix: the original unconditionally did `valid_mask.unsqueeze(1)` and
    crashed with AttributeError when `valid_mask=None`, despite the later
    `is not None` guard treating the mask as optional.
    """
    import torch
    import torch.nn.functional as F

    device = pred_depth.device

    pred_depth = pred_depth.unsqueeze(1)
    if valid_mask is not None:
        valid_mask = valid_mask.unsqueeze(1)

    # Grayscale conversion for gradient computation.
    if rgb.shape[1] == 3:
        gray = 0.299 * rgb[:, 0:1] + 0.587 * rgb[:, 1:2] + 0.114 * rgb[:, 2:3]
    else:
        gray = rgb

    sobel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]],
                           device=device, dtype=rgb.dtype).view(1, 1, 3, 3)
    sobel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]],
                           device=device, dtype=rgb.dtype).view(1, 1, 3, 3)

    gx = F.conv2d(gray, sobel_x, padding=1)
    gy = F.conv2d(gray, sobel_y, padding=1)
    mag = torch.sqrt(gx**2 + gy**2 + eps)

    # Per-image min/max normalization of the gradient magnitude.
    B = mag.shape[0]
    mag_flat = mag.view(B, -1)
    mag_min = mag_flat.min(dim=1, keepdim=True)[0].view(B, 1, 1, 1)
    mag_max = mag_flat.max(dim=1, keepdim=True)[0].view(B, 1, 1, 1)
    mag_norm = (mag - mag_min) / (mag_max - mag_min + eps)

    edges_gt = (mag_norm > th_rgb_grad).float()

    d = pred_depth.clamp(min=eps)

    def get_edge_with_nms(ratio_map, dim):
        # Keep only local maxima (1D non-max suppression along `dim`) of the
        # depth-ratio map that exceed the ratio threshold.
        is_candidate = ratio_map > th_depth_ratio

        if dim == 3:
            k_size, pad = (1, 3), (0, 1)
        else:
            k_size, pad = (3, 1), (1, 0)

        local_max = F.max_pool2d(
            ratio_map, kernel_size=k_size, stride=1, padding=pad)
        is_peak = (ratio_map == local_max)

        return is_candidate & is_peak

    d_pad = F.pad(d, (1, 1, 1, 1), mode='replicate')  # [B, 1, H+2, W+2]
    d_center = d

    # Right: d(x+1, y) / d(x, y)
    ratio_right = d_pad[:, :, 1:-1, 2:] / d_center
    mask_right = get_edge_with_nms(ratio_right, dim=3)

    # Left: d(x-1, y) / d(x, y)
    ratio_left = d_pad[:, :, 1:-1, :-2] / d_center
    mask_left = get_edge_with_nms(ratio_left, dim=3)

    # Bottom: d(x, y+1) / d(x, y)
    ratio_bottom = d_pad[:, :, 2:, 1:-1] / d_center
    mask_bottom = get_edge_with_nms(ratio_bottom, dim=2)

    # Top: d(x, y-1) / d(x, y)
    ratio_top = d_pad[:, :, :-2, 1:-1] / d_center
    mask_top = get_edge_with_nms(ratio_top, dim=2)

    edges_pred = (mask_right | mask_left | mask_bottom | mask_top).float()

    if valid_mask is not None:
        edges_gt = edges_gt * valid_mask
        edges_pred = edges_pred * valid_mask

    # Dilate both edge maps so a match within `tolerance` pixels counts.
    if tolerance > 0:
        kernel_size = 2 * tolerance + 1
        edges_gt_dilated = F.max_pool2d(
            edges_gt, kernel_size=kernel_size, stride=1, padding=tolerance)
        edges_pred_dilated = F.max_pool2d(
            edges_pred, kernel_size=kernel_size, stride=1, padding=tolerance)
    else:
        edges_gt_dilated = edges_gt
        edges_pred_dilated = edges_pred

    # True Positives
    tp_prec = (edges_pred * edges_gt_dilated).sum()
    tp_rec = (edges_gt * edges_pred_dilated).sum()

    # Totals
    n_pred = edges_pred.sum()
    n_gt = edges_gt.sum()

    precision = tp_prec / (n_pred + eps)
    recall = tp_rec / (n_gt + eps)
    f1_score = 2 * precision * recall / (precision + recall + eps)

    return {
        "f1": f1_score.item(),
        "precision": precision.item(),
        "recall": recall.item(),
    }
diffsynth/util/normal_utils.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.distributed as dist
6
+
7
+
8
+ def get_padding(orig_H, orig_W):
9
+ """ returns how the input of shape (orig_H, orig_W) should be padded
10
+ this ensures that both H and W are divisible by 32
11
+ """
12
+ if orig_W % 32 == 0:
13
+ l = 0
14
+ r = 0
15
+ else:
16
+ new_W = 32 * ((orig_W // 32) + 1)
17
+ l = (new_W - orig_W) // 2
18
+ r = (new_W - orig_W) - l
19
+
20
+ if orig_H % 32 == 0:
21
+ t = 0
22
+ b = 0
23
+ else:
24
+ new_H = 32 * ((orig_H // 32) + 1)
25
+ t = (new_H - orig_H) // 2
26
+ b = (new_H - orig_H) - t
27
+ return l, r, t, b
28
+
29
def pad_input(img, intrins, lrtb=(0, 0, 0, 0)):
    """Pad an ImageNet-normalized RGB batch and shift the camera principal point.

    Args:
        img: (B, 3, H, W) tensor normalized with ImageNet mean/std.
        intrins: (B, 3, 3) intrinsics or None; principal point is updated
            IN PLACE when padding is applied.
        lrtb: (left, right, top, bottom) padding in pixels.
    """
    l, r, t, b = lrtb
    if l + r + t + b == 0:
        return img, intrins

    # Per-channel pad values = ImageNet-normalized black (pixel value 0).
    pad_values = (
        (0 - 0.485) / 0.229,
        (0 - 0.456) / 0.224,
        (0 - 0.406) / 0.225,
    )
    channels = [
        F.pad(img[:, c:c + 1, :, :], (l, r, t, b), mode="constant", value=v)
        for c, v in enumerate(pad_values)
    ]
    img = torch.cat(channels, dim=1)

    if intrins is not None:
        intrins[:, 0, 2] += l
        intrins[:, 1, 2] += t
    return img, intrins
50
+
51
def compute_normal_error(pred_norm, gt_norm):
    """Per-pixel angular error in degrees between two normal maps.

    NOTE: pred_norm and gt_norm should be torch tensors of shape (B, 3, ...);
    the result has a singleton channel: (B, 1, ...).
    """
    cos_sim = torch.cosine_similarity(pred_norm, gt_norm, dim=1)
    # Clamp guards acos against tiny numerical overshoot past +-1.
    cos_sim = torch.clamp(cos_sim, min=-1.0, max=1.0)
    angle_deg = torch.acos(cos_sim) * 180.0 / np.pi
    return angle_deg.unsqueeze(1)  # (B, 1, ...)
60
+
61
def compute_normal_metrics(total_normal_errors):
    """Summarize surface-normal benchmark metrics.

    NOTE: `total_normal_errors` should be a 1D torch tensor of per-pixel
    angular errors in degrees; returns mean/median/rmse plus the percentage
    of pixels under the standard angle thresholds.
    """
    errors = total_normal_errors.detach().cpu().numpy()
    num_pixels = errors.shape[0]

    def pct_below(threshold):
        # Percentage of pixels with error strictly below `threshold` degrees.
        return 100.0 * (np.sum(errors < threshold) / num_pixels)

    return {
        'mean': np.average(errors),
        'median': np.median(errors),
        'rmse': np.sqrt(np.sum(errors * errors) / num_pixels),
        'a1': pct_below(5),
        'a2': pct_below(7.5),
        'a3': pct_below(11.25),
        'a4': pct_below(22.5),
        'a5': pct_below(30),
    }
diffsynth/util/seed_all.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # --------------------------------------------------------------------------
15
+ # If you find this code useful, we kindly ask you to cite our paper in your work.
16
+ # Please find bibtex at: https://github.com/prs-eth/Marigold#-citation
17
+ # More information about the method can be found at https://marigoldmonodepth.github.io
18
+ # --------------------------------------------------------------------------
19
+
20
+
21
+ import numpy as np
22
+ import random
23
+ import torch
24
+
25
+
26
def seed_all(seed: int = 0):
    """
    Set random seeds of all components (Python, NumPy, PyTorch CPU and CUDA).
    """
    # cuda.manual_seed_all is a safe no-op when no GPU is present.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
diffsynth/vram_management/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .layers import *
2
+ from .gradient_checkpointing import *
diffsynth/vram_management/gradient_checkpointing.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
def create_custom_forward(module):
    """Wrap `module` in a plain callable, as expected by torch checkpointing."""
    return lambda *args, **kwargs: module(*args, **kwargs)
8
+
9
+
10
def gradient_checkpoint_forward(
    model,
    use_gradient_checkpointing,
    use_gradient_checkpointing_offload,
    *args,
    **kwargs,
):
    """Run `model(*args, **kwargs)`, optionally under gradient checkpointing.

    Modes (first match wins):
    - `use_gradient_checkpointing_offload`: checkpoint AND stash the saved
      activations in CPU memory (lowest VRAM, slowest).
    - `use_gradient_checkpointing`: plain activation checkpointing.
    - neither: a direct forward call.

    Returns whatever `model` returns.
    """
    def run_checkpointed():
        # Non-reentrant checkpointing supports keyword arguments and does not
        # require all inputs to have requires_grad.
        return torch.utils.checkpoint.checkpoint(
            lambda *a, **k: model(*a, **k),
            *args,
            **kwargs,
            use_reentrant=False,
        )

    if use_gradient_checkpointing_offload:
        # Checkpoint and also keep saved tensors on the CPU.
        with torch.autograd.graph.save_on_cpu():
            return run_checkpointed()
    if use_gradient_checkpointing:
        return run_checkpointed()
    return model(*args, **kwargs)
diffsynth/vram_management/layers.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, copy
2
+ from ..models.utils import init_weights_on_device
3
+
4
+
5
def cast_to(weight, dtype, device):
    """Materialize a fresh copy of `weight` with the requested dtype/device."""
    buffer = torch.empty_like(weight, dtype=dtype, device=device)
    buffer.copy_(weight)
    return buffer
9
+
10
+
11
class AutoTorchModule(torch.nn.Module):
    """Base for modules that migrate between offload/onload/compute placements.

    Subclasses are expected to set the offload/onload/computation dtype and
    device attributes plus `vram_limit` and `state`
    (0 = offloaded, 1 = onloaded, 2 = kept on the computation device).
    """

    def __init__(self):
        super().__init__()

    def check_free_vram(self):
        # True while current GPU usage (in GiB) is below the configured limit.
        free_bytes, total_bytes = torch.cuda.mem_get_info(self.computation_device)
        used_gib = (total_bytes - free_bytes) / (1024 ** 3)
        return used_gib < self.vram_limit

    def offload(self):
        if self.state == 0:
            return
        self.to(dtype=self.offload_dtype, device=self.offload_device)
        self.state = 0

    def onload(self):
        if self.state == 1:
            return
        self.to(dtype=self.onload_dtype, device=self.onload_device)
        self.state = 1

    def keep(self):
        if self.state == 2:
            return
        self.to(dtype=self.computation_dtype, device=self.computation_device)
        self.state = 2
34
+
35
+
36
class AutoWrappedModule(AutoTorchModule):
    """Wraps an arbitrary submodule and decides, per forward call, where it runs.

    The wrapped module starts on the offload placement. At call time it is
    used in place when already usable, pinned to the computation device when
    VRAM headroom allows, or run via a throwaway copy otherwise.
    """

    def __init__(self, module: torch.nn.Module, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, **kwargs):
        super().__init__()
        self.module = module.to(dtype=offload_dtype, device=offload_device)
        self.offload_dtype = offload_dtype
        self.offload_device = offload_device
        self.onload_dtype = onload_dtype
        self.onload_device = onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
        self.vram_limit = vram_limit
        self.state = 0  # start in the offloaded state

    def forward(self, *args, **kwargs):
        target = self.module
        if self.state != 2:
            usable_in_place = (
                self.onload_dtype == self.computation_dtype
                and self.onload_device == self.computation_device
            )
            if not usable_in_place:
                if self.vram_limit is not None and self.check_free_vram():
                    # Enough headroom: pin the module on the compute device.
                    self.keep()
                else:
                    # Tight on memory: run a throwaway copy for this call only.
                    target = copy.deepcopy(self.module).to(
                        dtype=self.computation_dtype, device=self.computation_device
                    )
        return target(*args, **kwargs)
61
+
62
+
63
class WanAutoCastLayerNorm(torch.nn.LayerNorm, AutoTorchModule):
    # A LayerNorm that participates in VRAM management: it aliases the source
    # module's parameters and inherits AutoTorchModule's offload/onload/keep
    # state machine (state: 0 = offloaded, 1 = onloaded, 2 = kept on the
    # computation device).
    def __init__(self, module: torch.nn.LayerNorm, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, **kwargs):
        """Mirror `module`'s LayerNorm configuration without allocating storage.

        The meta-device context makes super().__init__'s own parameter
        allocation free; the real weight/bias tensors are then shared
        (aliased) from `module`.
        """
        with init_weights_on_device(device=torch.device("meta")):
            super().__init__(module.normalized_shape, eps=module.eps, elementwise_affine=module.elementwise_affine, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
        # Alias, not copy: moving this wrapper moves the source parameters too.
        self.weight = module.weight
        self.bias = module.bias
        self.offload_dtype = offload_dtype
        self.offload_device = offload_device
        self.onload_dtype = onload_dtype
        self.onload_device = onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
        self.vram_limit = vram_limit
        self.state = 0  # start in the offloaded state

    def forward(self, x, *args, **kwargs):
        """LayerNorm `x`, fetching parameters to the computation placement as needed."""
        if self.state == 2:
            # Parameters already pinned on the computation device.
            weight, bias = self.weight, self.bias
        else:
            if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
                # Onload placement doubles as the computation placement.
                weight, bias = self.weight, self.bias
            elif self.vram_limit is not None and self.check_free_vram():
                # Enough VRAM headroom: promote permanently (state -> 2).
                self.keep()
                weight, bias = self.weight, self.bias
            else:
                # Tight on memory: cast throwaway copies for this call only.
                weight = None if self.weight is None else cast_to(self.weight, self.computation_dtype, self.computation_device)
                bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
        with torch.amp.autocast(device_type=x.device.type):
            # Normalize in float32 for stability, then cast back to x's dtype.
            x = torch.nn.functional.layer_norm(x.float(), self.normalized_shape, weight, bias, self.eps).type_as(x)
        return x
93
+
94
+
95
class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
    # VRAM-managed Linear with optional LoRA adapters. Parameters are aliased
    # from the source module; the LoRA weight lists and merger are populated
    # externally (presumably by the LoRA loading code — not visible here).
    def __init__(self, module: torch.nn.Linear, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device, vram_limit, name="", **kwargs):
        """Mirror `module`'s shape without allocating, then alias its parameters."""
        with init_weights_on_device(device=torch.device("meta")):
            super().__init__(in_features=module.in_features, out_features=module.out_features, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
        # Alias, not copy, the source parameters.
        self.weight = module.weight
        self.bias = module.bias
        self.offload_dtype = offload_dtype
        self.offload_device = offload_device
        self.onload_dtype = onload_dtype
        self.onload_device = onload_device
        self.computation_dtype = computation_dtype
        self.computation_device = computation_device
        self.vram_limit = vram_limit
        self.state = 0  # 0 = offloaded, 1 = onloaded, 2 = kept on compute device
        self.name = name  # qualified layer name (used to target specific layers)
        # Per-adapter low-rank factors; each pair contributes x @ A.T @ B.T.
        self.lora_A_weights = []
        self.lora_B_weights = []
        # Optional module that fuses the base output with stacked LoRA outputs.
        self.lora_merger = None

    def forward(self, x, *args, **kwargs):
        """Linear transform of `x`, plus any attached LoRA contributions."""
        if self.state == 2:
            # Parameters already pinned on the computation device.
            weight, bias = self.weight, self.bias
        else:
            if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
                # Onload placement doubles as the computation placement.
                weight, bias = self.weight, self.bias
            elif self.vram_limit is not None and self.check_free_vram():
                # Enough VRAM headroom: promote permanently (state -> 2).
                self.keep()
                weight, bias = self.weight, self.bias
            else:
                # Tight on memory: cast throwaway copies for this call only.
                weight = cast_to(self.weight, self.computation_dtype, self.computation_device)
                bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
        out = torch.nn.functional.linear(x, weight, bias)

        if len(self.lora_A_weights) == 0:
            # No LoRA
            return out
        elif self.lora_merger is None:
            # Native LoRA inference
            for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
                out = out + x @ lora_A.T @ lora_B.T
        else:
            # LoRA fusion
            lora_output = []
            for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
                lora_output.append(x @ lora_A.T @ lora_B.T)
            lora_output = torch.stack(lora_output)
            out = self.lora_merger(out, lora_output)
        return out
143
+
144
+
145
def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, total_num_param=0, vram_limit=None, name_prefix=""):
    """Walk `model`, replacing children whose type appears in `module_map`
    with the mapped wrapper.

    Once the running parameter count would exceed `max_num_param`, later
    matches are built with `overflow_module_config` instead of
    `module_config`. Returns the updated running parameter count.
    """
    for child_name, child in model.named_children():
        qualified_name = child_name if not name_prefix else name_prefix + "." + child_name
        # First mapping entry whose source type matches this child, if any.
        wrapper_cls = next(
            (target for source, target in module_map.items() if isinstance(child, source)),
            None,
        )
        if wrapper_cls is None:
            # Not a managed type itself; keep descending into it.
            total_num_param = enable_vram_management_recursively(child, module_map, module_config, max_num_param, overflow_module_config, total_num_param, vram_limit=vram_limit, name_prefix=qualified_name)
            continue
        param_count = sum(p.numel() for p in child.parameters())
        over_budget = max_num_param is not None and total_num_param + param_count > max_num_param
        chosen_config = overflow_module_config if over_budget else module_config
        wrapped = wrapper_cls(child, **chosen_config, vram_limit=vram_limit, name=qualified_name)
        setattr(model, child_name, wrapped)
        total_num_param += param_count
    return total_num_param
162
+
163
+
164
def enable_vram_management(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, vram_limit=None):
    """Entry point: wrap eligible submodules of `model` and flag it as managed."""
    enable_vram_management_recursively(
        model,
        module_map,
        module_config,
        max_num_param=max_num_param,
        overflow_module_config=overflow_module_config,
        total_num_param=0,
        vram_limit=vram_limit,
    )
    model.vram_management_enabled = True
167
+
examples/__init__.py ADDED
File without changes
examples/dataset/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .hypersim_dataset import HypersimDataset
2
+ from .video_dataset.kitti_vid_dataset import KITTI_VID_Dataset
3
+ from .video_dataset.nyuv2_dataset import NYUv2Dataset
4
+ from .video_dataset.scannet_dataset import Scannet_VID_Dataset
5
+ from .video_dataset.tartanair_vid_dataset import TartanAir_VID_Dataset
6
+ from .video_dataset.vkitti_vid_dataset import VKITTI_VID_Dataset
7
+ from .vkitti_dataset import VKITTIDataset
8
+
9
+ __all__ = [
10
+ "HypersimDataset",
11
+ "KITTI_VID_Dataset",
12
+ "VKITTI_VID_Dataset",
13
+ "TartanAir_VID_Dataset",
14
+ "NYUv2Dataset",
15
+ "VKITTIDataset",
16
+ 'Scannet_VID_Dataset'
17
+ ]