Alexander Bagus commited on
Commit
95ee7de
·
1 Parent(s): cca610a
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. videox_fun/__init__.py +0 -0
  2. videox_fun/api/api.py +0 -226
  3. videox_fun/api/api_multi_nodes.py +0 -320
  4. videox_fun/data/__init__.py +0 -9
  5. videox_fun/data/bucket_sampler.py +0 -379
  6. videox_fun/data/dataset_image.py +0 -191
  7. videox_fun/data/dataset_image_video.py +0 -657
  8. videox_fun/data/dataset_video.py +0 -901
  9. videox_fun/data/utils.py +0 -347
  10. videox_fun/dist/__init__.py +0 -72
  11. videox_fun/dist/cogvideox_xfuser.py +0 -93
  12. videox_fun/dist/flux2_xfuser.py +0 -194
  13. videox_fun/dist/flux_xfuser.py +0 -165
  14. videox_fun/dist/fsdp.py +0 -44
  15. videox_fun/dist/fuser.py +0 -87
  16. videox_fun/dist/hunyuanvideo_xfuser.py +0 -166
  17. videox_fun/dist/qwen_xfuser.py +0 -176
  18. videox_fun/dist/wan_xfuser.py +0 -180
  19. videox_fun/dist/z_image_xfuser.py +0 -85
  20. videox_fun/models/__init__.py +0 -131
  21. videox_fun/models/attention_utils.py +0 -211
  22. videox_fun/models/cache_utils.py +0 -80
  23. videox_fun/models/cogvideox_transformer3d.py +0 -915
  24. videox_fun/models/cogvideox_vae.py +0 -1675
  25. videox_fun/models/fantasytalking_audio_encoder.py +0 -52
  26. videox_fun/models/fantasytalking_transformer3d.py +0 -644
  27. videox_fun/models/flux2_image_processor.py +0 -139
  28. videox_fun/models/flux2_transformer2d.py +0 -1278
  29. videox_fun/models/flux2_transformer2d_control.py +0 -312
  30. videox_fun/models/flux2_vae.py +0 -543
  31. videox_fun/models/flux_transformer2d.py +0 -832
  32. videox_fun/models/hunyuanvideo_transformer3d.py +0 -1478
  33. videox_fun/models/hunyuanvideo_vae.py +0 -1082
  34. videox_fun/models/qwenimage_transformer2d.py +0 -1118
  35. videox_fun/models/qwenimage_vae.py +0 -1087
  36. videox_fun/models/wan_animate_adapter.py +0 -397
  37. videox_fun/models/wan_animate_motion_encoder.py +0 -309
  38. videox_fun/models/wan_audio_encoder.py +0 -213
  39. videox_fun/models/wan_audio_injector.py +0 -1093
  40. videox_fun/models/wan_camera_adapter.py +0 -64
  41. videox_fun/models/wan_image_encoder.py +0 -553
  42. videox_fun/models/wan_text_encoder.py +0 -395
  43. videox_fun/models/wan_transformer3d.py +0 -1394
  44. videox_fun/models/wan_transformer3d_animate.py +0 -302
  45. videox_fun/models/wan_transformer3d_s2v.py +0 -932
  46. videox_fun/models/wan_transformer3d_vace.py +0 -394
  47. videox_fun/models/wan_vae.py +0 -860
  48. videox_fun/models/wan_vae3_8.py +0 -1091
  49. videox_fun/models/wan_xlm_roberta.py +0 -170
  50. videox_fun/models/z_image_transformer2d.py +0 -1050
videox_fun/__init__.py DELETED
File without changes
videox_fun/api/api.py DELETED
@@ -1,226 +0,0 @@
1
- import base64
2
- import gc
3
- import hashlib
4
- import io
5
- import os
6
- import tempfile
7
- from io import BytesIO
8
-
9
- import gradio as gr
10
- import requests
11
- import torch
12
- from fastapi import FastAPI
13
- from PIL import Image
14
-
15
-
16
- # Function to encode a file to Base64
17
- def encode_file_to_base64(file_path):
18
- with open(file_path, "rb") as file:
19
- # Encode the data to Base64
20
- file_base64 = base64.b64encode(file.read())
21
- return file_base64
22
-
23
- def update_diffusion_transformer_api(_: gr.Blocks, app: FastAPI, controller):
24
- @app.post("/videox_fun/update_diffusion_transformer")
25
- def _update_diffusion_transformer_api(
26
- datas: dict,
27
- ):
28
- diffusion_transformer_path = datas.get('diffusion_transformer_path', 'none')
29
-
30
- try:
31
- controller.update_diffusion_transformer(
32
- diffusion_transformer_path
33
- )
34
- comment = "Success"
35
- except Exception as e:
36
- torch.cuda.empty_cache()
37
- comment = f"Error. error information is {str(e)}"
38
-
39
- return {"message": comment}
40
-
41
- def download_from_url(url, timeout=10):
42
- try:
43
- response = requests.get(url, timeout=timeout)
44
- response.raise_for_status() # 检查请求是否成功
45
- return response.content
46
- except requests.exceptions.RequestException as e:
47
- print(f"Error downloading from {url}: {e}")
48
- return None
49
-
50
- def save_base64_video(base64_string):
51
- video_data = base64.b64decode(base64_string)
52
-
53
- md5_hash = hashlib.md5(video_data).hexdigest()
54
- filename = f"{md5_hash}.mp4"
55
-
56
- temp_dir = tempfile.gettempdir()
57
- file_path = os.path.join(temp_dir, filename)
58
-
59
- with open(file_path, 'wb') as video_file:
60
- video_file.write(video_data)
61
-
62
- return file_path
63
-
64
- def save_base64_image(base64_string):
65
- video_data = base64.b64decode(base64_string)
66
-
67
- md5_hash = hashlib.md5(video_data).hexdigest()
68
- filename = f"{md5_hash}.jpg"
69
-
70
- temp_dir = tempfile.gettempdir()
71
- file_path = os.path.join(temp_dir, filename)
72
-
73
- with open(file_path, 'wb') as video_file:
74
- video_file.write(video_data)
75
-
76
- return file_path
77
-
78
- def save_url_video(url):
79
- video_data = download_from_url(url)
80
- if video_data:
81
- return save_base64_video(base64.b64encode(video_data))
82
- return None
83
-
84
- def save_url_image(url):
85
- image_data = download_from_url(url)
86
- if image_data:
87
- return save_base64_image(base64.b64encode(image_data))
88
- return None
89
-
90
- def infer_forward_api(_: gr.Blocks, app: FastAPI, controller):
91
- @app.post("/videox_fun/infer_forward")
92
- def _infer_forward_api(
93
- datas: dict,
94
- ):
95
- base_model_path = datas.get('base_model_path', 'none')
96
- base_model_2_path = datas.get('base_model_2_path', 'none')
97
- lora_model_path = datas.get('lora_model_path', 'none')
98
- lora_model_2_path = datas.get('lora_model_2_path', 'none')
99
- lora_alpha_slider = datas.get('lora_alpha_slider', 0.55)
100
- prompt_textbox = datas.get('prompt_textbox', None)
101
- negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. ')
102
- sampler_dropdown = datas.get('sampler_dropdown', 'Euler')
103
- sample_step_slider = datas.get('sample_step_slider', 30)
104
- resize_method = datas.get('resize_method', "Generate by")
105
- width_slider = datas.get('width_slider', 672)
106
- height_slider = datas.get('height_slider', 384)
107
- base_resolution = datas.get('base_resolution', 512)
108
- is_image = datas.get('is_image', False)
109
- generation_method = datas.get('generation_method', False)
110
- length_slider = datas.get('length_slider', 49)
111
- overlap_video_length = datas.get('overlap_video_length', 4)
112
- partial_video_length = datas.get('partial_video_length', 72)
113
- cfg_scale_slider = datas.get('cfg_scale_slider', 6)
114
- start_image = datas.get('start_image', None)
115
- end_image = datas.get('end_image', None)
116
- validation_video = datas.get('validation_video', None)
117
- validation_video_mask = datas.get('validation_video_mask', None)
118
- control_video = datas.get('control_video', None)
119
- denoise_strength = datas.get('denoise_strength', 0.70)
120
- seed_textbox = datas.get("seed_textbox", 43)
121
-
122
- ref_image = datas.get('ref_image', None)
123
- enable_teacache = datas.get('enable_teacache', True)
124
- teacache_threshold = datas.get('teacache_threshold', 0.10)
125
- num_skip_start_steps = datas.get('num_skip_start_steps', 1)
126
- teacache_offload = datas.get('teacache_offload', False)
127
- cfg_skip_ratio = datas.get('cfg_skip_ratio', 0)
128
- enable_riflex = datas.get('enable_riflex', False)
129
- riflex_k = datas.get('riflex_k', 6)
130
- fps = datas.get('fps', None)
131
-
132
- generation_method = "Image Generation" if is_image else generation_method
133
-
134
- if start_image is not None:
135
- if start_image.startswith('http'):
136
- start_image = save_url_image(start_image)
137
- start_image = [Image.open(start_image).convert("RGB")]
138
- else:
139
- start_image = base64.b64decode(start_image)
140
- start_image = [Image.open(BytesIO(start_image)).convert("RGB")]
141
-
142
- if end_image is not None:
143
- if end_image.startswith('http'):
144
- end_image = save_url_image(end_image)
145
- end_image = [Image.open(end_image).convert("RGB")]
146
- else:
147
- end_image = base64.b64decode(end_image)
148
- end_image = [Image.open(BytesIO(end_image)).convert("RGB")]
149
-
150
- if validation_video is not None:
151
- if validation_video.startswith('http'):
152
- validation_video = save_url_video(validation_video)
153
- else:
154
- validation_video = save_base64_video(validation_video)
155
-
156
- if validation_video_mask is not None:
157
- if validation_video_mask.startswith('http'):
158
- validation_video_mask = save_url_image(validation_video_mask)
159
- else:
160
- validation_video_mask = save_base64_image(validation_video_mask)
161
-
162
- if control_video is not None:
163
- if control_video.startswith('http'):
164
- control_video = save_url_video(control_video)
165
- else:
166
- control_video = save_base64_video(control_video)
167
-
168
- if ref_image is not None:
169
- if ref_image.startswith('http'):
170
- ref_image = save_url_image(ref_image)
171
- ref_image = [Image.open(ref_image).convert("RGB")]
172
- else:
173
- ref_image = base64.b64decode(ref_image)
174
- ref_image = [Image.open(BytesIO(ref_image)).convert("RGB")]
175
-
176
- try:
177
- save_sample_path, comment = controller.generate(
178
- "",
179
- base_model_path,
180
- lora_model_path,
181
- lora_alpha_slider,
182
- prompt_textbox,
183
- negative_prompt_textbox,
184
- sampler_dropdown,
185
- sample_step_slider,
186
- resize_method,
187
- width_slider,
188
- height_slider,
189
- base_resolution,
190
- generation_method,
191
- length_slider,
192
- overlap_video_length,
193
- partial_video_length,
194
- cfg_scale_slider,
195
- start_image,
196
- end_image,
197
- validation_video,
198
- validation_video_mask,
199
- control_video,
200
- denoise_strength,
201
- seed_textbox,
202
- ref_image = ref_image,
203
- enable_teacache = enable_teacache,
204
- teacache_threshold = teacache_threshold,
205
- num_skip_start_steps = num_skip_start_steps,
206
- teacache_offload = teacache_offload,
207
- cfg_skip_ratio = cfg_skip_ratio,
208
- enable_riflex = enable_riflex,
209
- riflex_k = riflex_k,
210
- base_model_2_dropdown = base_model_2_path,
211
- lora_model_2_dropdown = lora_model_2_path,
212
- fps = fps,
213
- is_api = True,
214
- )
215
- except Exception as e:
216
- gc.collect()
217
- torch.cuda.empty_cache()
218
- torch.cuda.ipc_collect()
219
- save_sample_path = ""
220
- comment = f"Error. error information is {str(e)}"
221
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
222
-
223
- if save_sample_path != "":
224
- return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
225
- else:
226
- return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": None}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videox_fun/api/api_multi_nodes.py DELETED
@@ -1,320 +0,0 @@
1
- # This file is modified from https://github.com/xdit-project/xDiT/blob/main/entrypoints/launch.py
2
- import base64
3
- import gc
4
- import hashlib
5
- import io
6
- import os
7
- import tempfile
8
- from io import BytesIO
9
-
10
- import gradio as gr
11
- import requests
12
- import torch
13
- import torch.distributed as dist
14
- from fastapi import FastAPI, HTTPException
15
- from PIL import Image
16
-
17
- from .api import download_from_url, encode_file_to_base64
18
-
19
- try:
20
- import ray
21
- except:
22
- print("Ray is not installed. If you want to use multi gpus api. Please install it by running 'pip install ray'.")
23
- ray = None
24
-
25
- def save_base64_video_dist(base64_string):
26
- video_data = base64.b64decode(base64_string)
27
-
28
- md5_hash = hashlib.md5(video_data).hexdigest()
29
- filename = f"{md5_hash}.mp4"
30
-
31
- temp_dir = tempfile.gettempdir()
32
- file_path = os.path.join(temp_dir, filename)
33
-
34
- if dist.is_initialized():
35
- if dist.get_rank() == 0:
36
- with open(file_path, 'wb') as video_file:
37
- video_file.write(video_data)
38
- dist.barrier()
39
- else:
40
- with open(file_path, 'wb') as video_file:
41
- video_file.write(video_data)
42
- return file_path
43
-
44
- def save_base64_image_dist(base64_string):
45
- video_data = base64.b64decode(base64_string)
46
-
47
- md5_hash = hashlib.md5(video_data).hexdigest()
48
- filename = f"{md5_hash}.jpg"
49
-
50
- temp_dir = tempfile.gettempdir()
51
- file_path = os.path.join(temp_dir, filename)
52
-
53
- if dist.is_initialized():
54
- if dist.get_rank() == 0:
55
- with open(file_path, 'wb') as video_file:
56
- video_file.write(video_data)
57
- dist.barrier()
58
- else:
59
- with open(file_path, 'wb') as video_file:
60
- video_file.write(video_data)
61
- return file_path
62
-
63
- def save_url_video_dist(url):
64
- video_data = download_from_url(url)
65
- if video_data:
66
- return save_base64_video_dist(base64.b64encode(video_data))
67
- return None
68
-
69
- def save_url_image_dist(url):
70
- image_data = download_from_url(url)
71
- if image_data:
72
- return save_base64_image_dist(base64.b64encode(image_data))
73
- return None
74
-
75
- if ray is not None:
76
- @ray.remote(num_gpus=1)
77
- class MultiNodesGenerator:
78
- def __init__(
79
- self, rank: int, world_size: int, Controller,
80
- GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint",
81
- config_path=None, ulysses_degree=1, ring_degree=1,
82
- fsdp_dit=False, fsdp_text_encoder=False, compile_dit=False,
83
- weight_dtype=None, savedir_sample=None,
84
- ):
85
- # Set PyTorch distributed environment variables
86
- os.environ["RANK"] = str(rank)
87
- os.environ["WORLD_SIZE"] = str(world_size)
88
- os.environ["MASTER_ADDR"] = "127.0.0.1"
89
- os.environ["MASTER_PORT"] = "29500"
90
-
91
- self.rank = rank
92
- self.controller = Controller(
93
- GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path,
94
- ulysses_degree=ulysses_degree, ring_degree=ring_degree,
95
- fsdp_dit=fsdp_dit, fsdp_text_encoder=fsdp_text_encoder, compile_dit=compile_dit,
96
- weight_dtype=weight_dtype, savedir_sample=savedir_sample,
97
- )
98
-
99
- def generate(self, datas):
100
- try:
101
- base_model_path = datas.get('base_model_path', 'none')
102
- base_model_2_path = datas.get('base_model_2_path', 'none')
103
- lora_model_path = datas.get('lora_model_path', 'none')
104
- lora_model_2_path = datas.get('lora_model_2_path', 'none')
105
- lora_alpha_slider = datas.get('lora_alpha_slider', 0.55)
106
- prompt_textbox = datas.get('prompt_textbox', None)
107
- negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. ')
108
- sampler_dropdown = datas.get('sampler_dropdown', 'Euler')
109
- sample_step_slider = datas.get('sample_step_slider', 30)
110
- resize_method = datas.get('resize_method', "Generate by")
111
- width_slider = datas.get('width_slider', 672)
112
- height_slider = datas.get('height_slider', 384)
113
- base_resolution = datas.get('base_resolution', 512)
114
- is_image = datas.get('is_image', False)
115
- generation_method = datas.get('generation_method', False)
116
- length_slider = datas.get('length_slider', 49)
117
- overlap_video_length = datas.get('overlap_video_length', 4)
118
- partial_video_length = datas.get('partial_video_length', 72)
119
- cfg_scale_slider = datas.get('cfg_scale_slider', 6)
120
- start_image = datas.get('start_image', None)
121
- end_image = datas.get('end_image', None)
122
- validation_video = datas.get('validation_video', None)
123
- validation_video_mask = datas.get('validation_video_mask', None)
124
- control_video = datas.get('control_video', None)
125
- denoise_strength = datas.get('denoise_strength', 0.70)
126
- seed_textbox = datas.get("seed_textbox", 43)
127
-
128
- ref_image = datas.get('ref_image', None)
129
- enable_teacache = datas.get('enable_teacache', True)
130
- teacache_threshold = datas.get('teacache_threshold', 0.10)
131
- num_skip_start_steps = datas.get('num_skip_start_steps', 1)
132
- teacache_offload = datas.get('teacache_offload', False)
133
- cfg_skip_ratio = datas.get('cfg_skip_ratio', 0)
134
- enable_riflex = datas.get('enable_riflex', False)
135
- riflex_k = datas.get('riflex_k', 6)
136
- fps = datas.get('fps', None)
137
-
138
- generation_method = "Image Generation" if is_image else generation_method
139
-
140
- if start_image is not None:
141
- if start_image.startswith('http'):
142
- start_image = save_url_image_dist(start_image)
143
- start_image = [Image.open(start_image).convert("RGB")]
144
- else:
145
- start_image = base64.b64decode(start_image)
146
- start_image = [Image.open(BytesIO(start_image)).convert("RGB")]
147
-
148
- if end_image is not None:
149
- if end_image.startswith('http'):
150
- end_image = save_url_image_dist(end_image)
151
- end_image = [Image.open(end_image).convert("RGB")]
152
- else:
153
- end_image = base64.b64decode(end_image)
154
- end_image = [Image.open(BytesIO(end_image)).convert("RGB")]
155
-
156
- if validation_video is not None:
157
- if validation_video.startswith('http'):
158
- validation_video = save_url_video_dist(validation_video)
159
- else:
160
- validation_video = save_base64_video_dist(validation_video)
161
-
162
- if validation_video_mask is not None:
163
- if validation_video_mask.startswith('http'):
164
- validation_video_mask = save_url_image_dist(validation_video_mask)
165
- else:
166
- validation_video_mask = save_base64_image_dist(validation_video_mask)
167
-
168
- if control_video is not None:
169
- if control_video.startswith('http'):
170
- control_video = save_url_video_dist(control_video)
171
- else:
172
- control_video = save_base64_video_dist(control_video)
173
-
174
- if ref_image is not None:
175
- if ref_image.startswith('http'):
176
- ref_image = save_url_image_dist(ref_image)
177
- ref_image = [Image.open(ref_image).convert("RGB")]
178
- else:
179
- ref_image = base64.b64decode(ref_image)
180
- ref_image = [Image.open(BytesIO(ref_image)).convert("RGB")]
181
-
182
- try:
183
- save_sample_path, comment = self.controller.generate(
184
- "",
185
- base_model_path,
186
- lora_model_path,
187
- lora_alpha_slider,
188
- prompt_textbox,
189
- negative_prompt_textbox,
190
- sampler_dropdown,
191
- sample_step_slider,
192
- resize_method,
193
- width_slider,
194
- height_slider,
195
- base_resolution,
196
- generation_method,
197
- length_slider,
198
- overlap_video_length,
199
- partial_video_length,
200
- cfg_scale_slider,
201
- start_image,
202
- end_image,
203
- validation_video,
204
- validation_video_mask,
205
- control_video,
206
- denoise_strength,
207
- seed_textbox,
208
- ref_image = ref_image,
209
- enable_teacache = enable_teacache,
210
- teacache_threshold = teacache_threshold,
211
- num_skip_start_steps = num_skip_start_steps,
212
- teacache_offload = teacache_offload,
213
- cfg_skip_ratio = cfg_skip_ratio,
214
- enable_riflex = enable_riflex,
215
- riflex_k = riflex_k,
216
- base_model_2_dropdown = base_model_2_path,
217
- lora_model_2_dropdown = lora_model_2_path,
218
- fps = fps,
219
- is_api = True,
220
- )
221
- except Exception as e:
222
- gc.collect()
223
- torch.cuda.empty_cache()
224
- torch.cuda.ipc_collect()
225
- save_sample_path = ""
226
- comment = f"Error. error information is {str(e)}"
227
- if dist.is_initialized():
228
- if dist.get_rank() == 0:
229
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
230
- else:
231
- return None
232
- else:
233
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
234
-
235
-
236
- if dist.is_initialized():
237
- if dist.get_rank() == 0:
238
- if save_sample_path != "":
239
- return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
240
- else:
241
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
242
- else:
243
- return None
244
- else:
245
- if save_sample_path != "":
246
- return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
247
- else:
248
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
249
-
250
- except Exception as e:
251
- print(f"Error generating: {str(e)}")
252
- comment = f"Error generating: {str(e)}"
253
- if dist.is_initialized():
254
- if dist.get_rank() == 0:
255
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
256
- else:
257
- return None
258
- else:
259
- return {"message": comment, "save_sample_path": None, "base64_encoding": None}
260
-
261
- class MultiNodesEngine:
262
- def __init__(
263
- self,
264
- world_size,
265
- Controller,
266
- GPU_memory_mode,
267
- scheduler_dict,
268
- model_name,
269
- model_type,
270
- config_path,
271
- ulysses_degree=1,
272
- ring_degree=1,
273
- fsdp_dit=False,
274
- fsdp_text_encoder=False,
275
- compile_dit=False,
276
- weight_dtype=torch.bfloat16,
277
- savedir_sample="samples"
278
- ):
279
- # Ensure Ray is initialized
280
- if not ray.is_initialized():
281
- ray.init()
282
-
283
- num_workers = world_size
284
- self.workers = [
285
- MultiNodesGenerator.remote(
286
- rank, world_size, Controller,
287
- GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path,
288
- ulysses_degree=ulysses_degree, ring_degree=ring_degree,
289
- fsdp_dit=fsdp_dit, fsdp_text_encoder=fsdp_text_encoder, compile_dit=compile_dit,
290
- weight_dtype=weight_dtype, savedir_sample=savedir_sample,
291
- )
292
- for rank in range(num_workers)
293
- ]
294
- print("Update workers done")
295
-
296
- async def generate(self, data):
297
- results = ray.get([
298
- worker.generate.remote(data)
299
- for worker in self.workers
300
- ])
301
-
302
- return next(path for path in results if path is not None)
303
-
304
- def multi_nodes_infer_forward_api(_: gr.Blocks, app: FastAPI, engine):
305
-
306
- @app.post("/videox_fun/infer_forward")
307
- async def _multi_nodes_infer_forward_api(
308
- datas: dict,
309
- ):
310
- try:
311
- result = await engine.generate(datas)
312
- return result
313
- except Exception as e:
314
- if isinstance(e, HTTPException):
315
- raise e
316
- raise HTTPException(status_code=500, detail=str(e))
317
- else:
318
- MultiNodesEngine = None
319
- MultiNodesGenerator = None
320
- multi_nodes_infer_forward_api = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videox_fun/data/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- from .dataset_image import CC15M, ImageEditDataset
2
- from .dataset_image_video import (ImageVideoControlDataset, ImageVideoDataset, TextDataset,
3
- ImageVideoSampler)
4
- from .dataset_video import VideoDataset, VideoSpeechDataset, VideoAnimateDataset, WebVid10M
5
- from .utils import (VIDEO_READER_TIMEOUT, Camera, VideoReader_contextmanager,
6
- custom_meshgrid, get_random_mask, get_relative_pose,
7
- get_video_reader_batch, padding_image, process_pose_file,
8
- process_pose_params, ray_condition, resize_frame,
9
- resize_image_with_target_area)
 
 
 
 
 
 
 
 
 
 
videox_fun/data/bucket_sampler.py DELETED
@@ -1,379 +0,0 @@
1
- # Copyright (c) OpenMMLab. All rights reserved.
2
- import os
3
- from typing import (Generic, Iterable, Iterator, List, Optional, Sequence,
4
- Sized, TypeVar, Union)
5
-
6
- import cv2
7
- import numpy as np
8
- import torch
9
- from PIL import Image
10
- from torch.utils.data import BatchSampler, Dataset, Sampler
11
-
12
- ASPECT_RATIO_512 = {
13
- '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0],
14
- '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0],
15
- '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0],
16
- '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0],
17
- '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0],
18
- '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0],
19
- '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0],
20
- '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0],
21
- '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0],
22
- '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0]
23
- }
24
- ASPECT_RATIO_RANDOM_CROP_512 = {
25
- '0.42': [320.0, 768.0], '0.5': [352.0, 704.0],
26
- '0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0],
27
- '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0],
28
- '1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0],
29
- '2.0': [704.0, 352.0], '2.4': [768.0, 320.0]
30
- }
31
- ASPECT_RATIO_RANDOM_CROP_PROB = [
32
- 1, 2,
33
- 4, 4, 4, 4,
34
- 8, 8, 8,
35
- 4, 4, 4, 4,
36
- 2, 1
37
- ]
38
- ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB)
39
-
40
- def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512):
41
- aspect_ratio = height / width
42
- closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio))
43
- return ratios[closest_ratio], float(closest_ratio)
44
-
45
- def get_image_size_without_loading(path):
46
- with Image.open(path) as img:
47
- return img.size # (width, height)
48
-
49
- class RandomSampler(Sampler[int]):
50
- r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
51
-
52
- If with replacement, then user can specify :attr:`num_samples` to draw.
53
-
54
- Args:
55
- data_source (Dataset): dataset to sample from
56
- replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
57
- num_samples (int): number of samples to draw, default=`len(dataset)`.
58
- generator (Generator): Generator used in sampling.
59
- """
60
-
61
- data_source: Sized
62
- replacement: bool
63
-
64
- def __init__(self, data_source: Sized, replacement: bool = False,
65
- num_samples: Optional[int] = None, generator=None) -> None:
66
- self.data_source = data_source
67
- self.replacement = replacement
68
- self._num_samples = num_samples
69
- self.generator = generator
70
- self._pos_start = 0
71
-
72
- if not isinstance(self.replacement, bool):
73
- raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}")
74
-
75
- if not isinstance(self.num_samples, int) or self.num_samples <= 0:
76
- raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}")
77
-
78
- @property
79
- def num_samples(self) -> int:
80
- # dataset size might change at runtime
81
- if self._num_samples is None:
82
- return len(self.data_source)
83
- return self._num_samples
84
-
85
- def __iter__(self) -> Iterator[int]:
86
- n = len(self.data_source)
87
- if self.generator is None:
88
- seed = int(torch.empty((), dtype=torch.int64).random_().item())
89
- generator = torch.Generator()
90
- generator.manual_seed(seed)
91
- else:
92
- generator = self.generator
93
-
94
- if self.replacement:
95
- for _ in range(self.num_samples // 32):
96
- yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist()
97
- yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist()
98
- else:
99
- for _ in range(self.num_samples // n):
100
- xx = torch.randperm(n, generator=generator).tolist()
101
- if self._pos_start >= n:
102
- self._pos_start = 0
103
- print("xx top 10", xx[:10], self._pos_start)
104
- for idx in range(self._pos_start, n):
105
- yield xx[idx]
106
- self._pos_start = (self._pos_start + 1) % n
107
- self._pos_start = 0
108
- yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n]
109
-
110
- def __len__(self) -> int:
111
- return self.num_samples
112
-
113
- class AspectRatioBatchImageSampler(BatchSampler):
114
- """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
115
-
116
- Args:
117
- sampler (Sampler): Base sampler.
118
- dataset (Dataset): Dataset providing data information.
119
- batch_size (int): Size of mini-batch.
120
- drop_last (bool): If ``True``, the sampler will drop the last batch if
121
- its size would be less than ``batch_size``.
122
- aspect_ratios (dict): The predefined aspect ratios.
123
- """
124
- def __init__(
125
- self,
126
- sampler: Sampler,
127
- dataset: Dataset,
128
- batch_size: int,
129
- train_folder: str = None,
130
- aspect_ratios: dict = ASPECT_RATIO_512,
131
- drop_last: bool = False,
132
- config=None,
133
- **kwargs
134
- ) -> None:
135
- if not isinstance(sampler, Sampler):
136
- raise TypeError('sampler should be an instance of ``Sampler``, '
137
- f'but got {sampler}')
138
- if not isinstance(batch_size, int) or batch_size <= 0:
139
- raise ValueError('batch_size should be a positive integer value, '
140
- f'but got batch_size={batch_size}')
141
- self.sampler = sampler
142
- self.dataset = dataset
143
- self.train_folder = train_folder
144
- self.batch_size = batch_size
145
- self.aspect_ratios = aspect_ratios
146
- self.drop_last = drop_last
147
- self.config = config
148
- # buckets for each aspect ratio
149
- self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
150
- # [str(k) for k, v in aspect_ratios]
151
- self.current_available_bucket_keys = list(aspect_ratios.keys())
152
-
153
- def __iter__(self):
154
- for idx in self.sampler:
155
- try:
156
- image_dict = self.dataset[idx]
157
-
158
- width, height = image_dict.get("width", None), image_dict.get("height", None)
159
- if width is None or height is None:
160
- image_id, name = image_dict['file_path'], image_dict['text']
161
- if self.train_folder is None:
162
- image_dir = image_id
163
- else:
164
- image_dir = os.path.join(self.train_folder, image_id)
165
-
166
- width, height = get_image_size_without_loading(image_dir)
167
-
168
- ratio = height / width # self.dataset[idx]
169
- else:
170
- height = int(height)
171
- width = int(width)
172
- ratio = height / width # self.dataset[idx]
173
- except Exception as e:
174
- print(e)
175
- continue
176
- # find the closest aspect ratio
177
- closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
178
- if closest_ratio not in self.current_available_bucket_keys:
179
- continue
180
- bucket = self._aspect_ratio_buckets[closest_ratio]
181
- bucket.append(idx)
182
- # yield a batch of indices in the same aspect ratio group
183
- if len(bucket) == self.batch_size:
184
- yield bucket[:]
185
- del bucket[:]
186
-
187
- class AspectRatioBatchSampler(BatchSampler):
188
- """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
189
-
190
- Args:
191
- sampler (Sampler): Base sampler.
192
- dataset (Dataset): Dataset providing data information.
193
- batch_size (int): Size of mini-batch.
194
- drop_last (bool): If ``True``, the sampler will drop the last batch if
195
- its size would be less than ``batch_size``.
196
- aspect_ratios (dict): The predefined aspect ratios.
197
- """
198
- def __init__(
199
- self,
200
- sampler: Sampler,
201
- dataset: Dataset,
202
- batch_size: int,
203
- video_folder: str = None,
204
- train_data_format: str = "webvid",
205
- aspect_ratios: dict = ASPECT_RATIO_512,
206
- drop_last: bool = False,
207
- config=None,
208
- **kwargs
209
- ) -> None:
210
- if not isinstance(sampler, Sampler):
211
- raise TypeError('sampler should be an instance of ``Sampler``, '
212
- f'but got {sampler}')
213
- if not isinstance(batch_size, int) or batch_size <= 0:
214
- raise ValueError('batch_size should be a positive integer value, '
215
- f'but got batch_size={batch_size}')
216
- self.sampler = sampler
217
- self.dataset = dataset
218
- self.video_folder = video_folder
219
- self.train_data_format = train_data_format
220
- self.batch_size = batch_size
221
- self.aspect_ratios = aspect_ratios
222
- self.drop_last = drop_last
223
- self.config = config
224
- # buckets for each aspect ratio
225
- self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios}
226
- # [str(k) for k, v in aspect_ratios]
227
- self.current_available_bucket_keys = list(aspect_ratios.keys())
228
-
229
- def __iter__(self):
230
- for idx in self.sampler:
231
- try:
232
- video_dict = self.dataset[idx]
233
- width, more = video_dict.get("width", None), video_dict.get("height", None)
234
-
235
- if width is None or height is None:
236
- if self.train_data_format == "normal":
237
- video_id, name = video_dict['file_path'], video_dict['text']
238
- if self.video_folder is None:
239
- video_dir = video_id
240
- else:
241
- video_dir = os.path.join(self.video_folder, video_id)
242
- else:
243
- videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
244
- video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
245
- cap = cv2.VideoCapture(video_dir)
246
-
247
- # 获取视频尺寸
248
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # 浮点数转换为整数
249
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # 浮点数转换为整数
250
-
251
- ratio = height / width # self.dataset[idx]
252
- else:
253
- height = int(height)
254
- width = int(width)
255
- ratio = height / width # self.dataset[idx]
256
- except Exception as e:
257
- print(e, self.dataset[idx], "This item is error, please check it.")
258
- continue
259
- # find the closest aspect ratio
260
- closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
261
- if closest_ratio not in self.current_available_bucket_keys:
262
- continue
263
- bucket = self._aspect_ratio_buckets[closest_ratio]
264
- bucket.append(idx)
265
- # yield a batch of indices in the same aspect ratio group
266
- if len(bucket) == self.batch_size:
267
- yield bucket[:]
268
- del bucket[:]
269
-
270
- class AspectRatioBatchImageVideoSampler(BatchSampler):
271
- """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
272
-
273
- Args:
274
- sampler (Sampler): Base sampler.
275
- dataset (Dataset): Dataset providing data information.
276
- batch_size (int): Size of mini-batch.
277
- drop_last (bool): If ``True``, the sampler will drop the last batch if
278
- its size would be less than ``batch_size``.
279
- aspect_ratios (dict): The predefined aspect ratios.
280
- """
281
-
282
- def __init__(self,
283
- sampler: Sampler,
284
- dataset: Dataset,
285
- batch_size: int,
286
- train_folder: str = None,
287
- aspect_ratios: dict = ASPECT_RATIO_512,
288
- drop_last: bool = False
289
- ) -> None:
290
- if not isinstance(sampler, Sampler):
291
- raise TypeError('sampler should be an instance of ``Sampler``, '
292
- f'but got {sampler}')
293
- if not isinstance(batch_size, int) or batch_size <= 0:
294
- raise ValueError('batch_size should be a positive integer value, '
295
- f'but got batch_size={batch_size}')
296
- self.sampler = sampler
297
- self.dataset = dataset
298
- self.train_folder = train_folder
299
- self.batch_size = batch_size
300
- self.aspect_ratios = aspect_ratios
301
- self.drop_last = drop_last
302
-
303
- # buckets for each aspect ratio
304
- self.current_available_bucket_keys = list(aspect_ratios.keys())
305
- self.bucket = {
306
- 'image':{ratio: [] for ratio in aspect_ratios},
307
- 'video':{ratio: [] for ratio in aspect_ratios}
308
- }
309
-
310
- def __iter__(self):
311
- for idx in self.sampler:
312
- content_type = self.dataset[idx].get('type', 'image')
313
- if content_type == 'image':
314
- try:
315
- image_dict = self.dataset[idx]
316
-
317
- width, height = image_dict.get("width", None), image_dict.get("height", None)
318
- if width is None or height is None:
319
- image_id, name = image_dict['file_path'], image_dict['text']
320
- if self.train_folder is None:
321
- image_dir = image_id
322
- else:
323
- image_dir = os.path.join(self.train_folder, image_id)
324
-
325
- width, height = get_image_size_without_loading(image_dir)
326
-
327
- ratio = height / width # self.dataset[idx]
328
- else:
329
- height = int(height)
330
- width = int(width)
331
- ratio = height / width # self.dataset[idx]
332
- except Exception as e:
333
- print(e, self.dataset[idx], "This item is error, please check it.")
334
- continue
335
- # find the closest aspect ratio
336
- closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
337
- if closest_ratio not in self.current_available_bucket_keys:
338
- continue
339
- bucket = self.bucket['image'][closest_ratio]
340
- bucket.append(idx)
341
- # yield a batch of indices in the same aspect ratio group
342
- if len(bucket) == self.batch_size:
343
- yield bucket[:]
344
- del bucket[:]
345
- else:
346
- try:
347
- video_dict = self.dataset[idx]
348
- width, height = video_dict.get("width", None), video_dict.get("height", None)
349
-
350
- if width is None or height is None:
351
- video_id, name = video_dict['file_path'], video_dict['text']
352
- if self.train_folder is None:
353
- video_dir = video_id
354
- else:
355
- video_dir = os.path.join(self.train_folder, video_id)
356
- cap = cv2.VideoCapture(video_dir)
357
-
358
- # 获取视频尺寸
359
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # 浮点数转换为整数
360
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # 浮点数转换为整数
361
-
362
- ratio = height / width # self.dataset[idx]
363
- else:
364
- height = int(height)
365
- width = int(width)
366
- ratio = height / width # self.dataset[idx]
367
- except Exception as e:
368
- print(e, self.dataset[idx], "This item is error, please check it.")
369
- continue
370
- # find the closest aspect ratio
371
- closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio))
372
- if closest_ratio not in self.current_available_bucket_keys:
373
- continue
374
- bucket = self.bucket['video'][closest_ratio]
375
- bucket.append(idx)
376
- # yield a batch of indices in the same aspect ratio group
377
- if len(bucket) == self.batch_size:
378
- yield bucket[:]
379
- del bucket[:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videox_fun/data/dataset_image.py DELETED
@@ -1,191 +0,0 @@
1
- import json
2
- import os
3
- import random
4
-
5
- import numpy as np
6
- import torch
7
- import torchvision.transforms as transforms
8
- from PIL import Image
9
- from torch.utils.data.dataset import Dataset
10
-
11
-
12
- class CC15M(Dataset):
13
- def __init__(
14
- self,
15
- json_path,
16
- video_folder=None,
17
- resolution=512,
18
- enable_bucket=False,
19
- ):
20
- print(f"loading annotations from {json_path} ...")
21
- self.dataset = json.load(open(json_path, 'r'))
22
- self.length = len(self.dataset)
23
- print(f"data scale: {self.length}")
24
-
25
- self.enable_bucket = enable_bucket
26
- self.video_folder = video_folder
27
-
28
- resolution = tuple(resolution) if not isinstance(resolution, int) else (resolution, resolution)
29
- self.pixel_transforms = transforms.Compose([
30
- transforms.Resize(resolution[0]),
31
- transforms.CenterCrop(resolution),
32
- transforms.ToTensor(),
33
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
34
- ])
35
-
36
- def get_batch(self, idx):
37
- video_dict = self.dataset[idx]
38
- video_id, name = video_dict['file_path'], video_dict['text']
39
-
40
- if self.video_folder is None:
41
- video_dir = video_id
42
- else:
43
- video_dir = os.path.join(self.video_folder, video_id)
44
-
45
- pixel_values = Image.open(video_dir).convert("RGB")
46
- return pixel_values, name
47
-
48
- def __len__(self):
49
- return self.length
50
-
51
- def __getitem__(self, idx):
52
- while True:
53
- try:
54
- pixel_values, name = self.get_batch(idx)
55
- break
56
- except Exception as e:
57
- print(e)
58
- idx = random.randint(0, self.length-1)
59
-
60
- if not self.enable_bucket:
61
- pixel_values = self.pixel_transforms(pixel_values)
62
- else:
63
- pixel_values = np.array(pixel_values)
64
-
65
- sample = dict(pixel_values=pixel_values, text=name)
66
- return sample
67
-
68
- class ImageEditDataset(Dataset):
69
- def __init__(
70
- self,
71
- ann_path, data_root=None,
72
- image_sample_size=512,
73
- text_drop_ratio=0.1,
74
- enable_bucket=False,
75
- enable_inpaint=False,
76
- return_file_name=False,
77
- ):
78
- # Loading annotations from files
79
- print(f"loading annotations from {ann_path} ...")
80
- if ann_path.endswith('.csv'):
81
- with open(ann_path, 'r') as csvfile:
82
- dataset = list(csv.DictReader(csvfile))
83
- elif ann_path.endswith('.json'):
84
- dataset = json.load(open(ann_path))
85
-
86
- self.data_root = data_root
87
- self.dataset = dataset
88
-
89
- self.length = len(self.dataset)
90
- print(f"data scale: {self.length}")
91
- # TODO: enable bucket training
92
- self.enable_bucket = enable_bucket
93
- self.text_drop_ratio = text_drop_ratio
94
- self.enable_inpaint = enable_inpaint
95
- self.return_file_name = return_file_name
96
-
97
- # Image params
98
- self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size)
99
- self.image_transforms = transforms.Compose([
100
- transforms.Resize(min(self.image_sample_size)),
101
- transforms.CenterCrop(self.image_sample_size),
102
- transforms.ToTensor(),
103
- transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
104
- ])
105
-
106
- def get_batch(self, idx):
107
- data_info = self.dataset[idx % len(self.dataset)]
108
-
109
- image_path, text = data_info['file_path'], data_info['text']
110
- if self.data_root is not None:
111
- image_path = os.path.join(self.data_root, image_path)
112
- image = Image.open(image_path).convert('RGB')
113
-
114
- if not self.enable_bucket:
115
- raise ValueError("Not enable_bucket is not supported now. ")
116
- else:
117
- image = np.expand_dims(np.array(image), 0)
118
-
119
- source_image_path = data_info.get('source_file_path', [])
120
- source_image = []
121
- if isinstance(source_image_path, list):
122
- for _source_image_path in source_image_path:
123
- if self.data_root is not None:
124
- _source_image_path = os.path.join(self.data_root, _source_image_path)
125
- _source_image = Image.open(_source_image_path).convert('RGB')
126
- source_image.append(_source_image)
127
- else:
128
- if self.data_root is not None:
129
- _source_image_path = os.path.join(self.data_root, source_image_path)
130
- _source_image = Image.open(_source_image_path).convert('RGB')
131
- source_image.append(_source_image)
132
-
133
- if not self.enable_bucket:
134
- raise ValueError("Not enable_bucket is not supported now. ")
135
- else:
136
- source_image = [np.array(_source_image) for _source_image in source_image]
137
-
138
- if random.random() < self.text_drop_ratio:
139
- text = ''
140
- return image, source_image, text, 'image', image_path
141
-
142
- def __len__(self):
143
- return self.length
144
-
145
- def __getitem__(self, idx):
146
- data_info = self.dataset[idx % len(self.dataset)]
147
- data_type = data_info.get('type', 'image')
148
- while True:
149
- sample = {}
150
- try:
151
- data_info_local = self.dataset[idx % len(self.dataset)]
152
- data_type_local = data_info_local.get('type', 'image')
153
- if data_type_local != data_type:
154
- raise ValueError("data_type_local != data_type")
155
-
156
- pixel_values, source_pixel_values, name, data_type, file_path = self.get_batch(idx)
157
- sample["pixel_values"] = pixel_values
158
- sample["source_pixel_values"] = source_pixel_values
159
- sample["text"] = name
160
- sample["data_type"] = data_type
161
- sample["idx"] = idx
162
- if self.return_file_name:
163
- sample["file_name"] = os.path.basename(file_path)
164
-
165
- if len(sample) > 0:
166
- break
167
- except Exception as e:
168
- print(e, self.dataset[idx % len(self.dataset)])
169
- idx = random.randint(0, self.length-1)
170
-
171
- if self.enable_inpaint and not self.enable_bucket:
172
- mask = get_random_mask(pixel_values.size())
173
- mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
174
- sample["mask_pixel_values"] = mask_pixel_values
175
- sample["mask"] = mask
176
-
177
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
178
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
179
- sample["clip_pixel_values"] = clip_pixel_values
180
-
181
- return sample
182
-
183
- if __name__ == "__main__":
184
- dataset = CC15M(
185
- csv_path="./cc15m_add_index.json",
186
- resolution=512,
187
- )
188
-
189
- dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,)
190
- for idx, batch in enumerate(dataloader):
191
- print(batch["pixel_values"].shape, len(batch["text"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videox_fun/data/dataset_image_video.py DELETED
@@ -1,657 +0,0 @@
1
- import csv
2
- import gc
3
- import io
4
- import json
5
- import math
6
- import os
7
- import random
8
- from contextlib import contextmanager
9
- from random import shuffle
10
- from threading import Thread
11
-
12
- import albumentations
13
- import cv2
14
- import numpy as np
15
- import torch
16
- import torch.nn.functional as F
17
- import torchvision.transforms as transforms
18
- from decord import VideoReader
19
- from einops import rearrange
20
- from func_timeout import FunctionTimedOut, func_timeout
21
- from packaging import version as pver
22
- from PIL import Image
23
- from safetensors.torch import load_file
24
- from torch.utils.data import BatchSampler, Sampler
25
- from torch.utils.data.dataset import Dataset
26
-
27
- from .utils import (VIDEO_READER_TIMEOUT, Camera, VideoReader_contextmanager,
28
- custom_meshgrid, get_random_mask, get_relative_pose,
29
- get_video_reader_batch, padding_image, process_pose_file,
30
- process_pose_params, ray_condition, resize_frame,
31
- resize_image_with_target_area)
32
-
33
-
34
- class ImageVideoSampler(BatchSampler):
35
- """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
36
-
37
- Args:
38
- sampler (Sampler): Base sampler.
39
- dataset (Dataset): Dataset providing data information.
40
- batch_size (int): Size of mini-batch.
41
- drop_last (bool): If ``True``, the sampler will drop the last batch if
42
- its size would be less than ``batch_size``.
43
- aspect_ratios (dict): The predefined aspect ratios.
44
- """
45
-
46
- def __init__(self,
47
- sampler: Sampler,
48
- dataset: Dataset,
49
- batch_size: int,
50
- drop_last: bool = False
51
- ) -> None:
52
- if not isinstance(sampler, Sampler):
53
- raise TypeError('sampler should be an instance of ``Sampler``, '
54
- f'but got {sampler}')
55
- if not isinstance(batch_size, int) or batch_size <= 0:
56
- raise ValueError('batch_size should be a positive integer value, '
57
- f'but got batch_size={batch_size}')
58
- self.sampler = sampler
59
- self.dataset = dataset
60
- self.batch_size = batch_size
61
- self.drop_last = drop_last
62
-
63
- # buckets for each aspect ratio
64
- self.bucket = {'image':[], 'video':[]}
65
-
66
- def __iter__(self):
67
- for idx in self.sampler:
68
- content_type = self.dataset.dataset[idx].get('type', 'image')
69
- self.bucket[content_type].append(idx)
70
-
71
- # yield a batch of indices in the same aspect ratio group
72
- if len(self.bucket['video']) == self.batch_size:
73
- bucket = self.bucket['video']
74
- yield bucket[:]
75
- del bucket[:]
76
- elif len(self.bucket['image']) == self.batch_size:
77
- bucket = self.bucket['image']
78
- yield bucket[:]
79
- del bucket[:]
80
-
81
-
82
- class ImageVideoDataset(Dataset):
83
- def __init__(
84
- self,
85
- ann_path, data_root=None,
86
- video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
87
- image_sample_size=512,
88
- video_repeat=0,
89
- text_drop_ratio=0.1,
90
- enable_bucket=False,
91
- video_length_drop_start=0.0,
92
- video_length_drop_end=1.0,
93
- enable_inpaint=False,
94
- return_file_name=False,
95
- ):
96
- # Loading annotations from files
97
- print(f"loading annotations from {ann_path} ...")
98
- if ann_path.endswith('.csv'):
99
- with open(ann_path, 'r') as csvfile:
100
- dataset = list(csv.DictReader(csvfile))
101
- elif ann_path.endswith('.json'):
102
- dataset = json.load(open(ann_path))
103
-
104
- self.data_root = data_root
105
-
106
- # It's used to balance num of images and videos.
107
- if video_repeat > 0:
108
- self.dataset = []
109
- for data in dataset:
110
- if data.get('type', 'image') != 'video':
111
- self.dataset.append(data)
112
-
113
- for _ in range(video_repeat):
114
- for data in dataset:
115
- if data.get('type', 'image') == 'video':
116
- self.dataset.append(data)
117
- else:
118
- self.dataset = dataset
119
- del dataset
120
-
121
- self.length = len(self.dataset)
122
- print(f"data scale: {self.length}")
123
- # TODO: enable bucket training
124
- self.enable_bucket = enable_bucket
125
- self.text_drop_ratio = text_drop_ratio
126
- self.enable_inpaint = enable_inpaint
127
- self.return_file_name = return_file_name
128
-
129
- self.video_length_drop_start = video_length_drop_start
130
- self.video_length_drop_end = video_length_drop_end
131
-
132
- # Video params
133
- self.video_sample_stride = video_sample_stride
134
- self.video_sample_n_frames = video_sample_n_frames
135
- self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
136
- self.video_transforms = transforms.Compose(
137
- [
138
- transforms.Resize(min(self.video_sample_size)),
139
- transforms.CenterCrop(self.video_sample_size),
140
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
141
- ]
142
- )
143
-
144
- # Image params
145
- self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size)
146
- self.image_transforms = transforms.Compose([
147
- transforms.Resize(min(self.image_sample_size)),
148
- transforms.CenterCrop(self.image_sample_size),
149
- transforms.ToTensor(),
150
- transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
151
- ])
152
-
153
- self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size))
154
-
155
- def get_batch(self, idx):
156
- data_info = self.dataset[idx % len(self.dataset)]
157
-
158
- if data_info.get('type', 'image')=='video':
159
- video_id, text = data_info['file_path'], data_info['text']
160
-
161
- if self.data_root is None:
162
- video_dir = video_id
163
- else:
164
- video_dir = os.path.join(self.data_root, video_id)
165
-
166
- with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
167
- min_sample_n_frames = min(
168
- self.video_sample_n_frames,
169
- int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
170
- )
171
- if min_sample_n_frames == 0:
172
- raise ValueError(f"No Frames in video.")
173
-
174
- video_length = int(self.video_length_drop_end * len(video_reader))
175
- clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
176
- start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
177
- batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
178
-
179
- try:
180
- sample_args = (video_reader, batch_index)
181
- pixel_values = func_timeout(
182
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
183
- )
184
- resized_frames = []
185
- for i in range(len(pixel_values)):
186
- frame = pixel_values[i]
187
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
188
- resized_frames.append(resized_frame)
189
- pixel_values = np.array(resized_frames)
190
- except FunctionTimedOut:
191
- raise ValueError(f"Read {idx} timeout.")
192
- except Exception as e:
193
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
194
-
195
- if not self.enable_bucket:
196
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
197
- pixel_values = pixel_values / 255.
198
- del video_reader
199
- else:
200
- pixel_values = pixel_values
201
-
202
- if not self.enable_bucket:
203
- pixel_values = self.video_transforms(pixel_values)
204
-
205
- # Random use no text generation
206
- if random.random() < self.text_drop_ratio:
207
- text = ''
208
- return pixel_values, text, 'video', video_dir
209
- else:
210
- image_path, text = data_info['file_path'], data_info['text']
211
- if self.data_root is not None:
212
- image_path = os.path.join(self.data_root, image_path)
213
- image = Image.open(image_path).convert('RGB')
214
- if not self.enable_bucket:
215
- image = self.image_transforms(image).unsqueeze(0)
216
- else:
217
- image = np.expand_dims(np.array(image), 0)
218
- if random.random() < self.text_drop_ratio:
219
- text = ''
220
- return image, text, 'image', image_path
221
-
222
- def __len__(self):
223
- return self.length
224
-
225
- def __getitem__(self, idx):
226
- data_info = self.dataset[idx % len(self.dataset)]
227
- data_type = data_info.get('type', 'image')
228
- while True:
229
- sample = {}
230
- try:
231
- data_info_local = self.dataset[idx % len(self.dataset)]
232
- data_type_local = data_info_local.get('type', 'image')
233
- if data_type_local != data_type:
234
- raise ValueError("data_type_local != data_type")
235
-
236
- pixel_values, name, data_type, file_path = self.get_batch(idx)
237
- sample["pixel_values"] = pixel_values
238
- sample["text"] = name
239
- sample["data_type"] = data_type
240
- sample["idx"] = idx
241
- if self.return_file_name:
242
- sample["file_name"] = os.path.basename(file_path)
243
-
244
- if len(sample) > 0:
245
- break
246
- except Exception as e:
247
- print(e, self.dataset[idx % len(self.dataset)])
248
- idx = random.randint(0, self.length-1)
249
-
250
- if self.enable_inpaint and not self.enable_bucket:
251
- mask = get_random_mask(pixel_values.size())
252
- mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
253
- sample["mask_pixel_values"] = mask_pixel_values
254
- sample["mask"] = mask
255
-
256
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
257
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
258
- sample["clip_pixel_values"] = clip_pixel_values
259
-
260
- return sample
261
-
262
-
263
- class ImageVideoControlDataset(Dataset):
264
- def __init__(
265
- self,
266
- ann_path, data_root=None,
267
- video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
268
- image_sample_size=512,
269
- video_repeat=0,
270
- text_drop_ratio=0.1,
271
- enable_bucket=False,
272
- video_length_drop_start=0.1,
273
- video_length_drop_end=0.9,
274
- enable_inpaint=False,
275
- enable_camera_info=False,
276
- return_file_name=False,
277
- enable_subject_info=False,
278
- padding_subject_info=True,
279
- ):
280
- # Loading annotations from files
281
- print(f"loading annotations from {ann_path} ...")
282
- if ann_path.endswith('.csv'):
283
- with open(ann_path, 'r') as csvfile:
284
- dataset = list(csv.DictReader(csvfile))
285
- elif ann_path.endswith('.json'):
286
- dataset = json.load(open(ann_path))
287
-
288
- self.data_root = data_root
289
-
290
- # Repeat video entries to balance the number of images and videos.
291
- if video_repeat > 0:
292
- self.dataset = []
293
- for data in dataset:
294
- if data.get('type', 'image') != 'video':
295
- self.dataset.append(data)
296
-
297
- for _ in range(video_repeat):
298
- for data in dataset:
299
- if data.get('type', 'image') == 'video':
300
- self.dataset.append(data)
301
- else:
302
- self.dataset = dataset
303
- del dataset
304
-
305
- self.length = len(self.dataset)
306
- print(f"data scale: {self.length}")
307
- # TODO: enable bucket training
308
- self.enable_bucket = enable_bucket
309
- self.text_drop_ratio = text_drop_ratio
310
- self.enable_inpaint = enable_inpaint
311
- self.enable_camera_info = enable_camera_info
312
- self.enable_subject_info = enable_subject_info
313
- self.padding_subject_info = padding_subject_info
314
-
315
- self.video_length_drop_start = video_length_drop_start
316
- self.video_length_drop_end = video_length_drop_end
317
-
318
- # Video params
319
- self.video_sample_stride = video_sample_stride
320
- self.video_sample_n_frames = video_sample_n_frames
321
- self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
322
- self.video_transforms = transforms.Compose(
323
- [
324
- transforms.Resize(min(self.video_sample_size)),
325
- transforms.CenterCrop(self.video_sample_size),
326
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
327
- ]
328
- )
329
- if self.enable_camera_info:
330
- self.video_transforms_camera = transforms.Compose(
331
- [
332
- transforms.Resize(min(self.video_sample_size)),
333
- transforms.CenterCrop(self.video_sample_size)
334
- ]
335
- )
336
-
337
- # Image params
338
- self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size)
339
- self.image_transforms = transforms.Compose([
340
- transforms.Resize(min(self.image_sample_size)),
341
- transforms.CenterCrop(self.image_sample_size),
342
- transforms.ToTensor(),
343
- transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5])
344
- ])
345
-
346
- self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size))
347
-
348
- def get_batch(self, idx):
349
- data_info = self.dataset[idx % len(self.dataset)]
350
- video_id, text = data_info['file_path'], data_info['text']
351
-
352
- if data_info.get('type', 'image')=='video':
353
- if self.data_root is None:
354
- video_dir = video_id
355
- else:
356
- video_dir = os.path.join(self.data_root, video_id)
357
-
358
- with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
359
- min_sample_n_frames = min(
360
- self.video_sample_n_frames,
361
- int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
362
- )
363
- if min_sample_n_frames == 0:
364
- raise ValueError(f"No Frames in video.")
365
-
366
- video_length = int(self.video_length_drop_end * len(video_reader))
367
- clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
368
- start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
369
- batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
370
-
371
- try:
372
- sample_args = (video_reader, batch_index)
373
- pixel_values = func_timeout(
374
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
375
- )
376
- resized_frames = []
377
- for i in range(len(pixel_values)):
378
- frame = pixel_values[i]
379
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
380
- resized_frames.append(resized_frame)
381
- pixel_values = np.array(resized_frames)
382
- except FunctionTimedOut:
383
- raise ValueError(f"Read {idx} timeout.")
384
- except Exception as e:
385
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
386
-
387
- if not self.enable_bucket:
388
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
389
- pixel_values = pixel_values / 255.
390
- del video_reader
391
- else:
392
- pixel_values = pixel_values
393
-
394
- if not self.enable_bucket:
395
- pixel_values = self.video_transforms(pixel_values)
396
-
397
- # Randomly drop the text (for classifier-free guidance)
398
- if random.random() < self.text_drop_ratio:
399
- text = ''
400
-
401
- control_video_id = data_info['control_file_path']
402
-
403
- if control_video_id is not None:
404
- if self.data_root is None:
405
- control_video_id = control_video_id
406
- else:
407
- control_video_id = os.path.join(self.data_root, control_video_id)
408
-
409
- if self.enable_camera_info:
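- # Camera control: a .txt control file is treated as a camera pose file and converted to per-frame camera features, while the pixel-space control video is zeroed out.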
410
- if control_video_id.lower().endswith('.txt'):
411
- if not self.enable_bucket:
412
- control_pixel_values = torch.zeros_like(pixel_values)
413
-
414
- control_camera_values = process_pose_file(control_video_id, width=self.video_sample_size[1], height=self.video_sample_size[0])
415
- control_camera_values = torch.from_numpy(control_camera_values).permute(0, 3, 1, 2).contiguous()
416
- control_camera_values = F.interpolate(control_camera_values, size=(len(video_reader), control_camera_values.size(3)), mode='bilinear', align_corners=True)
417
- control_camera_values = self.video_transforms_camera(control_camera_values)
418
- else:
419
- control_pixel_values = np.zeros_like(pixel_values)
420
-
421
- control_camera_values = process_pose_file(control_video_id, width=self.video_sample_size[1], height=self.video_sample_size[0], return_poses=True)
422
- control_camera_values = torch.from_numpy(np.array(control_camera_values)).unsqueeze(0).unsqueeze(0)
423
- control_camera_values = F.interpolate(control_camera_values, size=(len(video_reader), control_camera_values.size(3)), mode='bilinear', align_corners=True)[0][0]
424
- control_camera_values = np.array([control_camera_values[index] for index in batch_index])
425
- else:
426
- if not self.enable_bucket:
427
- control_pixel_values = torch.zeros_like(pixel_values)
428
- control_camera_values = None
429
- else:
430
- control_pixel_values = np.zeros_like(pixel_values)
431
- control_camera_values = None
432
- else:
433
- if control_video_id is not None:
434
- with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader:
435
- try:
436
- sample_args = (control_video_reader, batch_index)
437
- control_pixel_values = func_timeout(
438
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
439
- )
440
- resized_frames = []
441
- for i in range(len(control_pixel_values)):
442
- frame = control_pixel_values[i]
443
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
444
- resized_frames.append(resized_frame)
445
- control_pixel_values = np.array(resized_frames)
446
- except FunctionTimedOut:
447
- raise ValueError(f"Read {idx} timeout.")
448
- except Exception as e:
449
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
450
-
451
- if not self.enable_bucket:
452
- control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous()
453
- control_pixel_values = control_pixel_values / 255.
454
- del control_video_reader
455
- else:
456
- control_pixel_values = control_pixel_values
457
-
458
- if not self.enable_bucket:
459
- control_pixel_values = self.video_transforms(control_pixel_values)
460
- else:
461
- if not self.enable_bucket:
462
- control_pixel_values = torch.zeros_like(pixel_values)
463
- else:
464
- control_pixel_values = np.zeros_like(pixel_values)
465
- control_camera_values = None
466
-
467
- if self.enable_subject_info:
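- # Subject references: load up to 4 shuffled reference images, pad them to the sample frame size or resize them to roughly 1 megapixel, and randomly flip horizontally for augmentation.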
468
- if not self.enable_bucket:
469
- visual_height, visual_width = pixel_values.shape[-2:]
470
- else:
471
- visual_height, visual_width = pixel_values.shape[1:3]
472
-
473
- subject_id = data_info.get('object_file_path', [])
474
- shuffle(subject_id)
475
- subject_images = []
476
- for i in range(min(len(subject_id), 4)):
477
- subject_image = Image.open(subject_id[i])
478
- width, height = subject_image.size
479
- total_pixels = width * height
480
-
481
- if self.padding_subject_info:
482
- img = padding_image(subject_image, visual_width, visual_height)
483
- else:
484
- img = resize_image_with_target_area(subject_image, 1024 * 1024)
485
-
486
- if random.random() < 0.5:
487
- img = img.transpose(Image.FLIP_LEFT_RIGHT)
488
- subject_images.append(np.array(img))
489
- if self.padding_subject_info:
490
- subject_image = np.array(subject_images)
491
- else:
492
- subject_image = subject_images
493
- else:
494
- subject_image = None
495
-
496
- return pixel_values, control_pixel_values, subject_image, control_camera_values, text, "video"
497
- else:
498
- image_path, text = data_info['file_path'], data_info['text']
499
- if self.data_root is not None:
500
- image_path = os.path.join(self.data_root, image_path)
501
- image = Image.open(image_path).convert('RGB')
502
- if not self.enable_bucket:
503
- image = self.image_transforms(image).unsqueeze(0)
504
- else:
505
- image = np.expand_dims(np.array(image), 0)
506
-
507
- if random.random() < self.text_drop_ratio:
508
- text = ''
509
-
510
- control_image_id = data_info['control_file_path']
511
-
512
- if self.data_root is None:
513
- control_image_id = control_image_id
514
- else:
515
- control_image_id = os.path.join(self.data_root, control_image_id)
516
-
517
- control_image = Image.open(control_image_id).convert('RGB')
518
- if not self.enable_bucket:
519
- control_image = self.image_transforms(control_image).unsqueeze(0)
520
- else:
521
- control_image = np.expand_dims(np.array(control_image), 0)
522
-
523
- if self.enable_subject_info:
524
- if not self.enable_bucket:
525
- visual_height, visual_width = image.shape[-2:]
526
- else:
527
- visual_height, visual_width = image.shape[1:3]
528
-
529
- subject_id = data_info.get('object_file_path', [])
530
- shuffle(subject_id)
531
- subject_images = []
532
- for i in range(min(len(subject_id), 4)):
533
- subject_image = Image.open(subject_id[i]).convert('RGB')
534
- width, height = subject_image.size
535
- total_pixels = width * height
536
-
537
- if self.padding_subject_info:
538
- img = padding_image(subject_image, visual_width, visual_height)
539
- else:
540
- img = resize_image_with_target_area(subject_image, 1024 * 1024)
541
-
542
- if random.random() < 0.5:
543
- img = img.transpose(Image.FLIP_LEFT_RIGHT)
544
- subject_images.append(np.array(img))
545
- if self.padding_subject_info:
546
- subject_image = np.array(subject_images)
547
- else:
548
- subject_image = subject_images
549
- else:
550
- subject_image = None
551
-
552
- return image, control_image, subject_image, None, text, 'image'
553
-
554
- def __len__(self):
555
- return self.length
556
-
557
- def __getitem__(self, idx):
558
- data_info = self.dataset[idx % len(self.dataset)]
559
- data_type = data_info.get('type', 'image')
560
- while True:
561
- sample = {}
562
- try:
563
- data_info_local = self.dataset[idx % len(self.dataset)]
564
- data_type_local = data_info_local.get('type', 'image')
565
- if data_type_local != data_type:
566
- raise ValueError("data_type_local != data_type")
567
-
568
- pixel_values, control_pixel_values, subject_image, control_camera_values, name, data_type = self.get_batch(idx)
569
-
570
- sample["pixel_values"] = pixel_values
571
- sample["control_pixel_values"] = control_pixel_values
572
- sample["subject_image"] = subject_image
573
- sample["text"] = name
574
- sample["data_type"] = data_type
575
- sample["idx"] = idx
576
-
577
- if self.enable_camera_info:
578
- sample["control_camera_values"] = control_camera_values
579
-
580
- if len(sample) > 0:
581
- break
582
- except Exception as e:
583
- print(e, self.dataset[idx % len(self.dataset)])
584
- idx = random.randint(0, self.length-1)
585
-
586
- if self.enable_inpaint and not self.enable_bucket:
587
- mask = get_random_mask(pixel_values.size())
588
- mask_pixel_values = pixel_values * (1 - mask) + torch.zeros_like(pixel_values) * mask
589
- sample["mask_pixel_values"] = mask_pixel_values
590
- sample["mask"] = mask
591
-
592
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
593
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
594
- sample["clip_pixel_values"] = clip_pixel_values
595
-
596
- return sample
597
-
598
-
599
- class ImageVideoSafetensorsDataset(Dataset):
600
- def __init__(
601
- self,
602
- ann_path,
603
- data_root=None,
604
- ):
605
- # Loading annotations from files
606
- print(f"loading annotations from {ann_path} ...")
607
- if ann_path.endswith('.json'):
608
- dataset = json.load(open(ann_path))
609
-
610
- self.data_root = data_root
611
- self.dataset = dataset
612
- self.length = len(self.dataset)
613
- print(f"data scale: {self.length}")
614
-
615
- def __len__(self):
616
- return self.length
617
-
618
- def __getitem__(self, idx):
619
- if self.data_root is None:
620
- path = self.dataset[idx]["file_path"]
621
- else:
622
- path = os.path.join(self.data_root, self.dataset[idx]["file_path"])
623
- state_dict = load_file(path)
624
- return state_dict
625
-
626
-
627
- class TextDataset(Dataset):
628
- def __init__(self, ann_path, text_drop_ratio=0.0):
629
- print(f"loading annotations from {ann_path} ...")
630
- with open(ann_path, 'r') as f:
631
- self.dataset = json.load(f)
632
- self.length = len(self.dataset)
633
- print(f"data scale: {self.length}")
634
- self.text_drop_ratio = text_drop_ratio
635
-
636
- def __len__(self):
637
- return self.length
638
-
639
- def __getitem__(self, idx):
640
- while True:
641
- try:
642
- item = self.dataset[idx]
643
- text = item['text']
644
-
645
- # Randomly drop text (for classifier-free guidance)
646
- if random.random() < self.text_drop_ratio:
647
- text = ''
648
-
649
- sample = {
650
- "text": text,
651
- "idx": idx
652
- }
653
- return sample
654
-
655
- except Exception as e:
656
- print(f"Error at index {idx}: {e}, retrying with random index...")
657
- idx = np.random.randint(0, self.length - 1)
videox_fun/data/dataset_video.py DELETED
@@ -1,901 +0,0 @@
1
- import csv
2
- import gc
3
- import io
4
- import json
5
- import math
6
- import os
7
- import random
8
- from contextlib import contextmanager
9
- from threading import Thread
10
-
11
- import albumentations
12
- import cv2
13
- import librosa
14
- import numpy as np
15
- import torch
16
- import torchvision.transforms as transforms
17
- from decord import VideoReader
18
- from einops import rearrange
19
- from func_timeout import FunctionTimedOut, func_timeout
20
- from PIL import Image
21
- from torch.utils.data import BatchSampler, Sampler
22
- from torch.utils.data.dataset import Dataset
23
-
24
- from .utils import (VIDEO_READER_TIMEOUT, Camera, VideoReader_contextmanager,
25
- custom_meshgrid, get_random_mask, get_relative_pose,
26
- get_video_reader_batch, padding_image, process_pose_file,
27
- process_pose_params, ray_condition, resize_frame,
28
- resize_image_with_target_area)
29
-
30
-
31
- class WebVid10M(Dataset):
32
- def __init__(
33
- self,
34
- csv_path, video_folder,
35
- sample_size=256, sample_stride=4, sample_n_frames=16,
36
- enable_bucket=False, enable_inpaint=False, is_image=False,
37
- ):
38
- print(f"loading annotations from {csv_path} ...")
39
- with open(csv_path, 'r') as csvfile:
40
- self.dataset = list(csv.DictReader(csvfile))
41
- self.length = len(self.dataset)
42
- print(f"data scale: {self.length}")
43
-
44
- self.video_folder = video_folder
45
- self.sample_stride = sample_stride
46
- self.sample_n_frames = sample_n_frames
47
- self.enable_bucket = enable_bucket
48
- self.enable_inpaint = enable_inpaint
49
- self.is_image = is_image
50
-
51
- sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
52
- self.pixel_transforms = transforms.Compose([
53
- transforms.Resize(sample_size[0]),
54
- transforms.CenterCrop(sample_size),
55
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
56
- ])
57
-
58
- def get_batch(self, idx):
59
- video_dict = self.dataset[idx]
60
- videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir']
61
-
62
- video_dir = os.path.join(self.video_folder, f"{videoid}.mp4")
63
- video_reader = VideoReader(video_dir)
64
- video_length = len(video_reader)
65
-
66
- if not self.is_image:
67
- clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1)
68
- start_idx = random.randint(0, video_length - clip_length)
69
- batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int)
70
- else:
71
- batch_index = [random.randint(0, video_length - 1)]
72
-
73
- if not self.enable_bucket:
74
- pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous()
75
- pixel_values = pixel_values / 255.
76
- del video_reader
77
- else:
78
- pixel_values = video_reader.get_batch(batch_index).asnumpy()
79
-
80
- if self.is_image:
81
- pixel_values = pixel_values[0]
82
- return pixel_values, name
83
-
84
- def __len__(self):
85
- return self.length
86
-
87
- def __getitem__(self, idx):
88
- while True:
89
- try:
90
- pixel_values, name = self.get_batch(idx)
91
- break
92
-
93
- except Exception as e:
94
- print("Error info:", e)
95
- idx = random.randint(0, self.length-1)
96
-
97
- if not self.enable_bucket:
98
- pixel_values = self.pixel_transforms(pixel_values)
99
- if self.enable_inpaint:
100
- mask = get_random_mask(pixel_values.size())
101
- mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
102
- sample = dict(pixel_values=pixel_values, mask_pixel_values=mask_pixel_values, mask=mask, text=name)
103
- else:
104
- sample = dict(pixel_values=pixel_values, text=name)
105
- return sample
106
-
107
-
108
- class VideoDataset(Dataset):
109
- def __init__(
110
- self,
111
- ann_path, data_root=None,
112
- sample_size=256, sample_stride=4, sample_n_frames=16,
113
- enable_bucket=False, enable_inpaint=False
114
- ):
115
- print(f"loading annotations from {ann_path} ...")
116
- self.dataset = json.load(open(ann_path, 'r'))
117
- self.length = len(self.dataset)
118
- print(f"data scale: {self.length}")
119
-
120
- self.data_root = data_root
121
- self.sample_stride = sample_stride
122
- self.sample_n_frames = sample_n_frames
123
- self.enable_bucket = enable_bucket
124
- self.enable_inpaint = enable_inpaint
- # get_batch below references these attribute names; the defaults here are assumptions chosen so the class is self-contained
- self.video_sample_stride = sample_stride
- self.video_sample_n_frames = sample_n_frames
- self.video_length_drop_start = 0.0
- self.video_length_drop_end = 1.0
- self.text_drop_ratio = 0.0
125
-
126
- sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
127
- self.pixel_transforms = transforms.Compose(
128
- [
129
- transforms.Resize(sample_size[0]),
130
- transforms.CenterCrop(sample_size),
131
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
132
- ]
133
- )
134
-
135
- def get_batch(self, idx):
136
- video_dict = self.dataset[idx]
137
- video_id, text = video_dict['file_path'], video_dict['text']
138
-
139
- if self.data_root is None:
140
- video_dir = video_id
141
- else:
142
- video_dir = os.path.join(self.data_root, video_id)
143
-
144
- with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
145
- min_sample_n_frames = min(
146
- self.video_sample_n_frames,
147
- int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
148
- )
149
- if min_sample_n_frames == 0:
150
- raise ValueError(f"No Frames in video.")
151
-
152
- video_length = int(self.video_length_drop_end * len(video_reader))
153
- clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
154
- start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
155
- batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
156
-
157
- try:
158
- sample_args = (video_reader, batch_index)
159
- pixel_values = func_timeout(
160
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
161
- )
162
- except FunctionTimedOut:
163
- raise ValueError(f"Read {idx} timeout.")
164
- except Exception as e:
165
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
166
-
167
- if not self.enable_bucket:
168
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
169
- pixel_values = pixel_values / 255.
170
- del video_reader
171
- else:
172
- pixel_values = pixel_values
173
-
174
- if not self.enable_bucket:
175
- pixel_values = self.pixel_transforms(pixel_values)
176
-
177
- # Randomly drop the text (for classifier-free guidance)
178
- if random.random() < self.text_drop_ratio:
179
- text = ''
180
- return pixel_values, text
181
-
182
- def __len__(self):
183
- return self.length
184
-
185
- def __getitem__(self, idx):
186
- while True:
187
- sample = {}
188
- try:
189
- pixel_values, name = self.get_batch(idx)
190
- sample["pixel_values"] = pixel_values
191
- sample["text"] = name
192
- sample["idx"] = idx
193
- if len(sample) > 0:
194
- break
195
-
196
- except Exception as e:
197
- print(e, self.dataset[idx % len(self.dataset)])
198
- idx = random.randint(0, self.length-1)
199
-
200
- if self.enable_inpaint and not self.enable_bucket:
201
- mask = get_random_mask(pixel_values.size())
202
- mask_pixel_values = pixel_values * (1 - mask) + torch.zeros_like(pixel_values) * mask
203
- sample["mask_pixel_values"] = mask_pixel_values
204
- sample["mask"] = mask
205
-
206
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
207
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
208
- sample["clip_pixel_values"] = clip_pixel_values
209
-
210
- return sample
211
-
212
-
213
- class VideoSpeechDataset(Dataset):
214
- def __init__(
215
- self,
216
- ann_path, data_root=None,
217
- video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
218
- enable_bucket=False, enable_inpaint=False,
219
- audio_sr=16000, # Added: target audio sample rate
220
- text_drop_ratio=0.1 # Added: text drop probability
221
- ):
222
- print(f"loading annotations from {ann_path} ...")
223
- self.dataset = json.load(open(ann_path, 'r'))
224
- self.length = len(self.dataset)
225
- print(f"data scale: {self.length}")
226
-
227
- self.data_root = data_root
228
- self.video_sample_stride = video_sample_stride
229
- self.video_sample_n_frames = video_sample_n_frames
230
- self.enable_bucket = enable_bucket
231
- self.enable_inpaint = enable_inpaint
232
- self.audio_sr = audio_sr
233
- self.text_drop_ratio = text_drop_ratio
234
-
235
- video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
236
- self.pixel_transforms = transforms.Compose(
237
- [
238
- transforms.Resize(video_sample_size[0]),
239
- transforms.CenterCrop(video_sample_size),
240
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
241
- ]
242
- )
243
-
244
- def get_batch(self, idx):
245
- video_dict = self.dataset[idx]
246
- video_id, text = video_dict['file_path'], video_dict['text']
247
- audio_id = video_dict['audio_path']
248
-
249
- if self.data_root is None:
250
- video_path = video_id
251
- else:
252
- video_path = os.path.join(self.data_root, video_id)
253
-
254
- if self.data_root is None:
255
- audio_path = audio_id
256
- else:
257
- audio_path = os.path.join(self.data_root, audio_id)
258
-
259
- if not os.path.exists(audio_path):
260
- raise FileNotFoundError(f"Audio file not found for {video_path}")
261
-
262
- with VideoReader_contextmanager(video_path, num_threads=2) as video_reader:
263
- total_frames = len(video_reader)
264
- fps = video_reader.get_avg_fps() # original video frame rate
265
-
266
- # Number of frames actually sampled (bounded by the video length)
267
- max_possible_frames = (total_frames - 1) // self.video_sample_stride + 1
268
- actual_n_frames = min(self.video_sample_n_frames, max_possible_frames)
269
- if actual_n_frames <= 0:
270
- raise ValueError(f"Video too short: {video_path}")
271
-
272
- # Randomly choose the start frame
273
- max_start = total_frames - (actual_n_frames - 1) * self.video_sample_stride - 1
274
- start_frame = random.randint(0, max_start) if max_start > 0 else 0
275
- frame_indices = [start_frame + i * self.video_sample_stride for i in range(actual_n_frames)]
276
-
277
- # Read the video frames
278
- try:
279
- sample_args = (video_reader, frame_indices)
280
- pixel_values = func_timeout(
281
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
282
- )
283
- except FunctionTimedOut:
284
- raise ValueError(f"Read {idx} timeout.")
285
- except Exception as e:
286
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
287
-
288
- # Video post-processing
289
- if not self.enable_bucket:
290
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
291
- pixel_values = pixel_values / 255.
292
- pixel_values = self.pixel_transforms(pixel_values)
293
-
294
- # === Added: load and crop the matching audio segment ===
295
- # Start/end time of the video clip (in seconds)
296
- start_time = start_frame / fps
297
- end_time = (start_frame + (actual_n_frames - 1) * self.video_sample_stride) / fps
298
- duration = end_time - start_time
299
-
300
- # Load the whole audio with librosa (librosa.load cannot seek precisely, so load first, then slice)
301
- audio_input, sample_rate = librosa.load(audio_path, sr=self.audio_sr) # resample to the target sr
302
-
303
- # Convert times to sample indices
304
- start_sample = int(start_time * self.audio_sr)
305
- end_sample = int(end_time * self.audio_sr)
306
-
307
- # Safe slicing
308
- if start_sample >= len(audio_input):
309
- # Audio too short: pad with zeros or truncate
310
- audio_segment = np.zeros(int(duration * self.audio_sr), dtype=np.float32)
311
- else:
312
- audio_segment = audio_input[start_sample:end_sample]
313
- # Pad with zeros if the segment is too short
314
- target_len = int(duration * self.audio_sr)
315
- if len(audio_segment) < target_len:
316
- audio_segment = np.pad(audio_segment, (0, target_len - len(audio_segment)), mode='constant')
317
-
318
- # === Random text drop ===
319
- if random.random() < self.text_drop_ratio:
320
- text = ''
321
-
322
- return pixel_values, text, audio_segment, sample_rate
323
-
324
- def __len__(self):
325
- return self.length
326
-
327
- def __getitem__(self, idx):
328
- while True:
329
- sample = {}
330
- try:
331
- pixel_values, text, audio, sample_rate = self.get_batch(idx)
332
- sample["pixel_values"] = pixel_values
333
- sample["text"] = text
334
- sample["audio"] = torch.from_numpy(audio).float() # 转为 tensor
335
- sample["sample_rate"] = sample_rate
336
- sample["idx"] = idx
337
- break
338
- except Exception as e:
339
- print(f"Error processing {idx}: {e}, retrying with random idx...")
340
- idx = random.randint(0, self.length - 1)
341
-
342
- if self.enable_inpaint and not self.enable_bucket:
343
- mask = get_random_mask(pixel_values.size(), image_start_only=True)
344
- mask_pixel_values = pixel_values * (1 - mask) + torch.zeros_like(pixel_values) * mask
345
- sample["mask_pixel_values"] = mask_pixel_values
346
- sample["mask"] = mask
347
-
348
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
349
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
350
- sample["clip_pixel_values"] = clip_pixel_values
351
-
352
- return sample
353
-
354
-
355
- class VideoSpeechControlDataset(Dataset):
356
- def __init__(
357
- self,
358
- ann_path, data_root=None,
359
- video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
360
- enable_bucket=False, enable_inpaint=False,
361
- audio_sr=16000,
362
- text_drop_ratio=0.1,
363
- enable_motion_info=False,
364
- motion_frames=73,
365
- ):
366
- print(f"loading annotations from {ann_path} ...")
367
- self.dataset = json.load(open(ann_path, 'r'))
368
- self.length = len(self.dataset)
369
- print(f"data scale: {self.length}")
370
-
371
- self.data_root = data_root
372
- self.video_sample_stride = video_sample_stride
373
- self.video_sample_n_frames = video_sample_n_frames
374
- self.enable_bucket = enable_bucket
375
- self.enable_inpaint = enable_inpaint
376
- self.audio_sr = audio_sr
377
- self.text_drop_ratio = text_drop_ratio
378
- self.enable_motion_info = enable_motion_info
379
- self.motion_frames = motion_frames
380
-
381
- video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
382
- self.pixel_transforms = transforms.Compose(
383
- [
384
- transforms.Resize(video_sample_size[0]),
385
- transforms.CenterCrop(video_sample_size),
386
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
387
- ]
388
- )
389
-
390
- self.video_sample_size = video_sample_size
391
-
392
- def get_batch(self, idx):
393
- video_dict = self.dataset[idx]
394
- video_id, text = video_dict['file_path'], video_dict['text']
395
- audio_id = video_dict['audio_path']
396
- control_video_id = video_dict['control_file_path']
397
-
398
- if self.data_root is None:
399
- video_path = video_id
400
- else:
401
- video_path = os.path.join(self.data_root, video_id)
402
-
403
- if self.data_root is None:
404
- audio_path = audio_id
405
- else:
406
- audio_path = os.path.join(self.data_root, audio_id)
407
-
408
- if self.data_root is None:
409
- control_video_id = control_video_id
410
- else:
411
- control_video_id = os.path.join(self.data_root, control_video_id)
412
-
413
- if not os.path.exists(audio_path):
414
- raise FileNotFoundError(f"Audio file not found for {video_path}")
415
-
416
- # Video information
417
- with VideoReader_contextmanager(video_path, num_threads=2) as video_reader:
418
- total_frames = len(video_reader)
419
- fps = video_reader.get_avg_fps()
420
- if fps <= 0:
421
- raise ValueError(f"Video has negative fps: {video_path}")
422
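- # Increase the sampling stride until the effective frame rate after subsampling drops to 30 fps or below.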
- local_video_sample_stride = self.video_sample_stride
423
- new_fps = int(fps // local_video_sample_stride)
424
- while new_fps > 30:
425
- local_video_sample_stride = local_video_sample_stride + 1
426
- new_fps = int(fps // local_video_sample_stride)
427
-
428
- max_possible_frames = (total_frames - 1) // local_video_sample_stride + 1
429
- actual_n_frames = min(self.video_sample_n_frames, max_possible_frames)
430
- if actual_n_frames <= 0:
431
- raise ValueError(f"Video too short: {video_path}")
432
-
433
- max_start = total_frames - (actual_n_frames - 1) * local_video_sample_stride - 1
434
- start_frame = random.randint(0, max_start) if max_start > 0 else 0
435
- frame_indices = [start_frame + i * local_video_sample_stride for i in range(actual_n_frames)]
436
-
437
- try:
438
- sample_args = (video_reader, frame_indices)
439
- pixel_values = func_timeout(
440
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
441
- )
442
- except FunctionTimedOut:
443
- raise ValueError(f"Read {idx} timeout.")
444
- except Exception as e:
445
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
446
-
447
- _, height, width, channel = np.shape(pixel_values)
448
- if self.enable_motion_info:
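- # Motion context: strided frames preceding the sampled clip (up to self.motion_frames of them); positions with no history stay mid-gray (127.5).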
449
- motion_pixel_values = np.ones([self.motion_frames, height, width, channel]) * 127.5
450
- if start_frame > 0:
451
- motion_max_possible_frames = (start_frame - 1) // local_video_sample_stride + 1
452
- motion_frame_indices = [0 + i * local_video_sample_stride for i in range(motion_max_possible_frames)]
453
- motion_frame_indices = motion_frame_indices[-self.motion_frames:]
454
-
455
- _motion_sample_args = (video_reader, motion_frame_indices)
456
- _motion_pixel_values = func_timeout(
457
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=_motion_sample_args
458
- )
459
- motion_pixel_values[-len(motion_frame_indices):] = _motion_pixel_values
460
-
461
- if not self.enable_bucket:
462
- motion_pixel_values = torch.from_numpy(motion_pixel_values).permute(0, 3, 1, 2).contiguous()
463
- motion_pixel_values = motion_pixel_values / 255.
464
- motion_pixel_values = self.pixel_transforms(motion_pixel_values)
465
- else:
466
- motion_pixel_values = None
467
-
468
- if not self.enable_bucket:
469
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
470
- pixel_values = pixel_values / 255.
471
- pixel_values = self.pixel_transforms(pixel_values)
472
-
473
- # Audio information
474
- start_time = start_frame / fps
475
- end_time = (start_frame + (actual_n_frames - 1) * local_video_sample_stride) / fps
476
- duration = end_time - start_time
477
-
478
- audio_input, sample_rate = librosa.load(audio_path, sr=self.audio_sr)
479
- start_sample = int(start_time * self.audio_sr)
480
- end_sample = int(end_time * self.audio_sr)
481
-
482
- if start_sample >= len(audio_input):
483
- raise ValueError(f"Audio file too short: {audio_path}")
484
- else:
485
- audio_segment = audio_input[start_sample:end_sample]
486
- target_len = int(duration * self.audio_sr)
487
- if len(audio_segment) < target_len:
488
- raise ValueError(f"Audio file too short: {audio_path}")
489
-
490
- # Control information
491
- with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader:
492
- try:
493
- sample_args = (control_video_reader, frame_indices)
494
- control_pixel_values = func_timeout(
495
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
496
- )
497
- resized_frames = []
498
- for i in range(len(control_pixel_values)):
499
- frame = control_pixel_values[i]
500
- resized_frame = resize_frame(frame, max(self.video_sample_size))
501
- resized_frames.append(resized_frame)
502
- control_pixel_values = np.array(resized_frames)
503
- except FunctionTimedOut:
504
- raise ValueError(f"Read {idx} timeout.")
505
- except Exception as e:
506
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
507
-
508
- if not self.enable_bucket:
509
- control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous()
510
- control_pixel_values = control_pixel_values / 255.
511
- del control_video_reader
512
- else:
513
- control_pixel_values = control_pixel_values
514
-
515
- if not self.enable_bucket:
516
- control_pixel_values = self.pixel_transforms(control_pixel_values)
517
-
518
- if random.random() < self.text_drop_ratio:
519
- text = ''
520
-
521
- return pixel_values, motion_pixel_values, control_pixel_values, text, audio_segment, sample_rate, new_fps
522
-
523
- def __len__(self):
524
- return self.length
525
-
526
- def __getitem__(self, idx):
527
- while True:
528
- sample = {}
529
- try:
530
- pixel_values, motion_pixel_values, control_pixel_values, text, audio, sample_rate, new_fps = self.get_batch(idx)
531
- sample["pixel_values"] = pixel_values
532
- sample["motion_pixel_values"] = motion_pixel_values
533
- sample["control_pixel_values"] = control_pixel_values
534
- sample["text"] = text
535
- sample["audio"] = torch.from_numpy(audio).float() # 转为 tensor
536
- sample["sample_rate"] = sample_rate
537
- sample["fps"] = new_fps
538
- sample["idx"] = idx
539
- break
540
- except Exception as e:
541
- print(f"Error processing {idx}: {e}, retrying with random idx...")
542
- idx = random.randint(0, self.length - 1)
543
-
544
- if self.enable_inpaint and not self.enable_bucket:
545
- mask = get_random_mask(pixel_values.size(), image_start_only=True)
546
- mask_pixel_values = pixel_values * (1 - mask) + torch.zeros_like(pixel_values) * mask
547
- sample["mask_pixel_values"] = mask_pixel_values
548
- sample["mask"] = mask
549
-
550
- clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
551
- clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
552
- sample["clip_pixel_values"] = clip_pixel_values
553
-
554
- return sample
555
-
556
-
557
- class VideoAnimateDataset(Dataset):
558
- def __init__(
559
- self,
560
- ann_path, data_root=None,
561
- video_sample_size=512,
562
- video_sample_stride=4,
563
- video_sample_n_frames=16,
564
- video_repeat=0,
565
- text_drop_ratio=0.1,
566
- enable_bucket=False,
567
- video_length_drop_start=0.1,
568
- video_length_drop_end=0.9,
569
- return_file_name=False,
570
- ):
571
- # Loading annotations from files
572
- print(f"loading annotations from {ann_path} ...")
573
- if ann_path.endswith('.csv'):
574
- with open(ann_path, 'r') as csvfile:
575
- dataset = list(csv.DictReader(csvfile))
576
- elif ann_path.endswith('.json'):
577
- dataset = json.load(open(ann_path))
578
-
579
- self.data_root = data_root
580
-
581
- # Repeat video entries to balance the number of images and videos.
582
- if video_repeat > 0:
583
- self.dataset = []
584
- for data in dataset:
585
- if data.get('type', 'image') != 'video':
586
- self.dataset.append(data)
587
-
588
- for _ in range(video_repeat):
589
- for data in dataset:
590
- if data.get('type', 'image') == 'video':
591
- self.dataset.append(data)
592
- else:
593
- self.dataset = dataset
594
- del dataset
595
-
596
- self.length = len(self.dataset)
597
- print(f"data scale: {self.length}")
598
- # TODO: enable bucket training
599
- self.enable_bucket = enable_bucket
600
- self.text_drop_ratio = text_drop_ratio
601
-
602
- self.video_length_drop_start = video_length_drop_start
603
- self.video_length_drop_end = video_length_drop_end
604
-
605
- # Video params
606
- self.video_sample_stride = video_sample_stride
607
- self.video_sample_n_frames = video_sample_n_frames
608
- self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size)
609
- self.video_transforms = transforms.Compose(
610
- [
611
- transforms.Resize(min(self.video_sample_size)),
612
- transforms.CenterCrop(self.video_sample_size),
613
- transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
614
- ]
615
- )
616
-
617
- self.larger_side_of_image_and_video = min(self.video_sample_size)
618
-
619
- def get_batch(self, idx):
620
- data_info = self.dataset[idx % len(self.dataset)]
621
- video_id, text = data_info['file_path'], data_info['text']
622
-
623
- if self.data_root is None:
624
- video_dir = video_id
625
- else:
626
- video_dir = os.path.join(self.data_root, video_id)
627
-
628
- with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader:
629
- min_sample_n_frames = min(
630
- self.video_sample_n_frames,
631
- int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride)
632
- )
633
- if min_sample_n_frames == 0:
634
- raise ValueError(f"No Frames in video.")
635
-
636
- video_length = int(self.video_length_drop_end * len(video_reader))
637
- clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
638
- start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
639
- batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
640
-
641
- try:
642
- sample_args = (video_reader, batch_index)
643
- pixel_values = func_timeout(
644
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
645
- )
646
- resized_frames = []
647
- for i in range(len(pixel_values)):
648
- frame = pixel_values[i]
649
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
650
- resized_frames.append(resized_frame)
651
- pixel_values = np.array(resized_frames)
652
- except FunctionTimedOut:
653
- raise ValueError(f"Read {idx} timeout.")
654
- except Exception as e:
655
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
656
-
657
- if not self.enable_bucket:
658
- pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous()
659
- pixel_values = pixel_values / 255.
660
- del video_reader
661
- else:
662
- pixel_values = pixel_values
663
-
664
- if not self.enable_bucket:
665
- pixel_values = self.video_transforms(pixel_values)
666
-
667
- # Randomly drop the text (for classifier-free guidance)
668
- if random.random() < self.text_drop_ratio:
669
- text = ''
670
-
671
- control_video_id = data_info['control_file_path']
672
-
673
- if control_video_id is not None:
674
- if self.data_root is None:
675
- control_video_id = control_video_id
676
- else:
677
- control_video_id = os.path.join(self.data_root, control_video_id)
678
-
679
- if control_video_id is not None:
680
- with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader:
681
- try:
682
- sample_args = (control_video_reader, batch_index)
683
- control_pixel_values = func_timeout(
684
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
685
- )
686
- resized_frames = []
687
- for i in range(len(control_pixel_values)):
688
- frame = control_pixel_values[i]
689
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
690
- resized_frames.append(resized_frame)
691
- control_pixel_values = np.array(resized_frames)
692
- except FunctionTimedOut:
693
- raise ValueError(f"Read {idx} timeout.")
694
- except Exception as e:
695
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
696
-
697
- if not self.enable_bucket:
698
- control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous()
699
- control_pixel_values = control_pixel_values / 255.
700
- del control_video_reader
701
- else:
702
- control_pixel_values = control_pixel_values
703
-
704
- if not self.enable_bucket:
705
- control_pixel_values = self.video_transforms(control_pixel_values)
706
- else:
707
- if not self.enable_bucket:
708
- control_pixel_values = torch.zeros_like(pixel_values)
709
- else:
710
- control_pixel_values = np.zeros_like(pixel_values)
711
-
712
- face_video_id = data_info['face_file_path']
713
-
714
- if face_video_id is not None:
715
- if self.data_root is None:
716
- face_video_id = face_video_id
717
- else:
718
- face_video_id = os.path.join(self.data_root, face_video_id)
719
-
720
- if face_video_id is not None:
721
- with VideoReader_contextmanager(face_video_id, num_threads=2) as face_video_reader:
722
- try:
723
- sample_args = (face_video_reader, batch_index)
724
- face_pixel_values = func_timeout(
725
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
726
- )
727
- resized_frames = []
728
- for i in range(len(face_pixel_values)):
729
- frame = face_pixel_values[i]
730
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
731
- resized_frames.append(resized_frame)
732
- face_pixel_values = np.array(resized_frames)
733
- except FunctionTimedOut:
734
- raise ValueError(f"Read {idx} timeout.")
735
- except Exception as e:
736
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
737
-
738
- if not self.enable_bucket:
739
- face_pixel_values = torch.from_numpy(face_pixel_values).permute(0, 3, 1, 2).contiguous()
740
- face_pixel_values = face_pixel_values / 255.
741
- del face_video_reader
742
- else:
743
- face_pixel_values = face_pixel_values
744
-
745
- if not self.enable_bucket:
746
- face_pixel_values = self.video_transforms(face_pixel_values)
747
- else:
748
- if not self.enable_bucket:
749
- face_pixel_values = torch.zeros_like(pixel_values)
750
- else:
751
- face_pixel_values = np.zeros_like(pixel_values)
752
-
753
- background_video_id = data_info.get('background_file_path', None)
754
-
755
- if background_video_id is not None:
756
- if self.data_root is None:
757
- background_video_id = background_video_id
758
- else:
759
- background_video_id = os.path.join(self.data_root, background_video_id)
760
-
761
- if background_video_id is not None:
762
- with VideoReader_contextmanager(background_video_id, num_threads=2) as background_video_reader:
763
- try:
764
- sample_args = (background_video_reader, batch_index)
765
- background_pixel_values = func_timeout(
766
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
767
- )
768
- resized_frames = []
769
- for i in range(len(background_pixel_values)):
770
- frame = background_pixel_values[i]
771
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
772
- resized_frames.append(resized_frame)
773
- background_pixel_values = np.array(resized_frames)
774
- except FunctionTimedOut:
775
- raise ValueError(f"Read {idx} timeout.")
776
- except Exception as e:
777
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
778
-
779
- if not self.enable_bucket:
780
- background_pixel_values = torch.from_numpy(background_pixel_values).permute(0, 3, 1, 2).contiguous()
781
- background_pixel_values = background_pixel_values / 255.
782
- del background_video_reader
783
- else:
784
- background_pixel_values = background_pixel_values
785
-
786
- if not self.enable_bucket:
787
- background_pixel_values = self.video_transforms(background_pixel_values)
788
- else:
789
- if not self.enable_bucket:
790
- background_pixel_values = torch.ones_like(pixel_values) * 127.5
791
- else:
792
- background_pixel_values = np.ones_like(pixel_values) * 127.5
793
-
794
- mask_video_id = data_info.get('mask_file_path', None)
795
-
796
- if mask_video_id is not None:
797
- if self.data_root is None:
798
- mask_video_id = mask_video_id
799
- else:
800
- mask_video_id = os.path.join(self.data_root, mask_video_id)
801
-
802
- if mask_video_id is not None:
803
- with VideoReader_contextmanager(mask_video_id, num_threads=2) as mask_video_reader:
804
- try:
805
- sample_args = (mask_video_reader, batch_index)
806
- mask = func_timeout(
807
- VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args
808
- )
809
- resized_frames = []
810
- for i in range(len(mask)):
811
- frame = mask[i]
812
- resized_frame = resize_frame(frame, self.larger_side_of_image_and_video)
813
- resized_frames.append(resized_frame)
814
- mask = np.array(resized_frames)
815
- except FunctionTimedOut:
816
- raise ValueError(f"Read {idx} timeout.")
817
- except Exception as e:
818
- raise ValueError(f"Failed to extract frames from video. Error is {e}.")
819
-
820
- if not self.enable_bucket:
821
- mask = torch.from_numpy(mask).permute(0, 3, 1, 2).contiguous()
822
- mask = mask / 255.
823
- del mask_video_reader
824
- else:
825
- mask = mask
826
- else:
827
- if not self.enable_bucket:
828
- mask = torch.ones_like(pixel_values)
829
- else:
830
- mask = np.ones_like(pixel_values) * 255
831
- mask = mask[:, :, :, :1]
832
-
833
- ref_pixel_values_path = data_info.get('ref_file_path', [])
834
- if self.data_root is not None:
835
- ref_pixel_values_path = os.path.join(self.data_root, ref_pixel_values_path)
836
- ref_pixel_values = Image.open(ref_pixel_values_path).convert('RGB')
837
-
838
- if not self.enable_bucket:
839
- raise ValueError("Not enable_bucket is not supported now. ")
840
- else:
841
- ref_pixel_values = np.array(ref_pixel_values)
842
-
843
- return pixel_values, control_pixel_values, face_pixel_values, background_pixel_values, mask, ref_pixel_values, text, "video"
844
-
845
- def __len__(self):
846
- return self.length
847
-
848
- def __getitem__(self, idx):
849
- data_info = self.dataset[idx % len(self.dataset)]
850
- data_type = data_info.get('type', 'image')
851
- while True:
852
- sample = {}
853
- try:
854
- data_info_local = self.dataset[idx % len(self.dataset)]
855
- data_type_local = data_info_local.get('type', 'image')
856
- if data_type_local != data_type:
857
- raise ValueError("data_type_local != data_type")
858
-
859
- pixel_values, control_pixel_values, face_pixel_values, background_pixel_values, mask, ref_pixel_values, name, data_type = \
860
- self.get_batch(idx)
861
-
862
- sample["pixel_values"] = pixel_values
863
- sample["control_pixel_values"] = control_pixel_values
864
- sample["face_pixel_values"] = face_pixel_values
865
- sample["background_pixel_values"] = background_pixel_values
866
- sample["mask"] = mask
867
- sample["ref_pixel_values"] = ref_pixel_values
868
- sample["clip_pixel_values"] = ref_pixel_values
869
- sample["text"] = name
870
- sample["data_type"] = data_type
871
- sample["idx"] = idx
872
-
873
- if len(sample) > 0:
874
- break
875
- except Exception as e:
876
- print(e, self.dataset[idx % len(self.dataset)])
877
- idx = random.randint(0, self.length-1)
878
-
879
- return sample
880
-
881
-
882
- if __name__ == "__main__":
883
- if 1:
884
- dataset = VideoDataset(
885
- ann_path="./webvidval/results_2M_val.json",
886
- sample_size=256,
887
- sample_stride=4, sample_n_frames=16,
888
- )
889
-
890
- if 0:
891
- dataset = WebVid10M(
892
- csv_path="./webvid/results_2M_val.csv",
893
- video_folder="./webvid/2M_val",
894
- sample_size=256,
895
- sample_stride=4, sample_n_frames=16,
896
- is_image=False,
897
- )
898
-
899
- dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,)
900
- for idx, batch in enumerate(dataloader):
901
- print(batch["pixel_values"].shape, len(batch["text"]))
videox_fun/data/utils.py DELETED
@@ -1,347 +0,0 @@
1
- import csv
2
- import gc
3
- import io
4
- import json
5
- import math
6
- import os
7
- import random
8
- from contextlib import contextmanager
9
- from random import shuffle
10
- from threading import Thread
11
-
12
- import albumentations
13
- import cv2
14
- import numpy as np
15
- import torch
16
- import torch.nn.functional as F
17
- import torchvision.transforms as transforms
18
- from decord import VideoReader
19
- from einops import rearrange
20
- from func_timeout import FunctionTimedOut, func_timeout
21
- from packaging import version as pver
22
- from PIL import Image
23
- from safetensors.torch import load_file
24
- from torch.utils.data import BatchSampler, Sampler
25
- from torch.utils.data.dataset import Dataset
26
-
27
- VIDEO_READER_TIMEOUT = 20
28
-
29
- def get_random_mask(shape, image_start_only=False):
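- # Build a random spatio-temporal inpainting mask; mask_index picks one of several patterns (rectangle, full frame, temporal segments, random noise, per-frame blocks, ellipse, circle, random whole frames).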
30
- f, c, h, w = shape
31
- mask = torch.zeros((f, 1, h, w), dtype=torch.uint8)
32
-
33
- if not image_start_only:
34
- if f != 1:
35
- mask_index = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], p=[0.05, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05])
36
- else:
37
- mask_index = np.random.choice([0, 1, 7, 8], p = [0.2, 0.7, 0.05, 0.05])
38
- if mask_index == 0:
39
- center_x = torch.randint(0, w, (1,)).item()
40
- center_y = torch.randint(0, h, (1,)).item()
41
- block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # block width range
42
- block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # block height range
43
-
44
- start_x = max(center_x - block_size_x // 2, 0)
45
- end_x = min(center_x + block_size_x // 2, w)
46
- start_y = max(center_y - block_size_y // 2, 0)
47
- end_y = min(center_y + block_size_y // 2, h)
48
- mask[:, :, start_y:end_y, start_x:end_x] = 1
49
- elif mask_index == 1:
50
- mask[:, :, :, :] = 1
51
- elif mask_index == 2:
52
- mask_frame_index = np.random.randint(1, 5)
53
- mask[mask_frame_index:, :, :, :] = 1
54
- elif mask_index == 3:
55
- mask_frame_index = np.random.randint(1, 5)
56
- mask[mask_frame_index:-mask_frame_index, :, :, :] = 1
57
- elif mask_index == 4:
58
- center_x = torch.randint(0, w, (1,)).item()
59
- center_y = torch.randint(0, h, (1,)).item()
60
- block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # block width range
61
- block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # block height range
62
-
63
- start_x = max(center_x - block_size_x // 2, 0)
64
- end_x = min(center_x + block_size_x // 2, w)
65
- start_y = max(center_y - block_size_y // 2, 0)
66
- end_y = min(center_y + block_size_y // 2, h)
67
-
68
- mask_frame_before = np.random.randint(0, f // 2)
69
- mask_frame_after = np.random.randint(f // 2, f)
70
- mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1
71
- elif mask_index == 5:
72
- mask = torch.randint(0, 2, (f, 1, h, w), dtype=torch.uint8)
73
- elif mask_index == 6:
74
- num_frames_to_mask = random.randint(1, max(f // 2, 1))
75
- frames_to_mask = random.sample(range(f), num_frames_to_mask)
76
-
77
- for i in frames_to_mask:
78
- block_height = random.randint(1, h // 4)
79
- block_width = random.randint(1, w // 4)
80
- top_left_y = random.randint(0, h - block_height)
81
- top_left_x = random.randint(0, w - block_width)
82
- mask[i, 0, top_left_y:top_left_y + block_height, top_left_x:top_left_x + block_width] = 1
83
- elif mask_index == 7:
84
- center_x = torch.randint(0, w, (1,)).item()
85
- center_y = torch.randint(0, h, (1,)).item()
86
- a = torch.randint(min(w, h) // 8, min(w, h) // 4, (1,)).item() # semi-major axis
87
- b = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() # semi-minor axis
88
-
89
- for i in range(h):
90
- for j in range(w):
91
- if ((i - center_y) ** 2) / (b ** 2) + ((j - center_x) ** 2) / (a ** 2) < 1:
92
- mask[:, :, i, j] = 1
93
- elif mask_index == 8:
94
- center_x = torch.randint(0, w, (1,)).item()
95
- center_y = torch.randint(0, h, (1,)).item()
96
- radius = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item()
97
- for i in range(h):
98
- for j in range(w):
99
- if (i - center_y) ** 2 + (j - center_x) ** 2 < radius ** 2:
100
- mask[:, :, i, j] = 1
101
- elif mask_index == 9:
102
- for idx in range(f):
103
- if np.random.rand() > 0.5:
104
- mask[idx, :, :, :] = 1
105
- else:
106
- raise ValueError(f"The mask_index {mask_index} is not define")
107
- else:
108
- if f != 1:
109
- mask[1:, :, :, :] = 1
110
- else:
111
- mask[:, :, :, :] = 1
112
- return mask
113
-
114
- @contextmanager
115
- def VideoReader_contextmanager(*args, **kwargs):
116
- vr = VideoReader(*args, **kwargs)
117
- try:
118
- yield vr
119
- finally:
120
- del vr
121
- gc.collect()
122
-
123
- def get_video_reader_batch(video_reader, batch_index):
124
- frames = video_reader.get_batch(batch_index).asnumpy()
125
- return frames
126
-
127
- def resize_frame(frame, target_short_side):
128
- h, w, _ = frame.shape
129
- if h < w:
130
- if target_short_side > h:
131
- return frame
132
- new_h = target_short_side
133
- new_w = int(target_short_side * w / h)
134
- else:
135
- if target_short_side > w:
136
- return frame
137
- new_w = target_short_side
138
- new_h = int(target_short_side * h / w)
139
-
140
- resized_frame = cv2.resize(frame, (new_w, new_h))
141
- return resized_frame
142
-
143
- def padding_image(images, new_width, new_height):
144
- new_image = Image.new('RGB', (new_width, new_height), (255, 255, 255))
145
-
146
- aspect_ratio = images.width / images.height
147
- if new_width / new_height > 1:
148
- if aspect_ratio > new_width / new_height:
149
- new_img_width = new_width
150
- new_img_height = int(new_img_width / aspect_ratio)
151
- else:
152
- new_img_height = new_height
153
- new_img_width = int(new_img_height * aspect_ratio)
154
- else:
155
- if aspect_ratio > new_width / new_height:
156
- new_img_width = new_width
157
- new_img_height = int(new_img_width / aspect_ratio)
158
- else:
159
- new_img_height = new_height
160
- new_img_width = int(new_img_height * aspect_ratio)
161
-
162
- resized_img = images.resize((new_img_width, new_img_height))
163
-
164
- paste_x = (new_width - new_img_width) // 2
165
- paste_y = (new_height - new_img_height) // 2
166
-
167
- new_image.paste(resized_img, (paste_x, paste_y))
168
-
169
- return new_image
170
-
171
- def resize_image_with_target_area(img: Image.Image, target_area: int = 1024 * 1024) -> Image.Image:
172
- """
173
- 将 PIL 图像缩放到接近指定像素面积(target_area),保持原始宽高比,
174
- 并确保新宽度和高度均为 32 的整数倍。
175
-
176
- 参数:
177
- img (PIL.Image.Image): 输入图像
178
- target_area (int): 目标像素总面积,例如 1024*1024 = 1048576
179
-
180
- 返回:
181
- PIL.Image.Image: Resize 后的图像
182
- """
183
- orig_w, orig_h = img.size
184
- if orig_w == 0 or orig_h == 0:
185
- raise ValueError("Input image has zero width or height.")
186
-
187
- ratio = orig_w / orig_h
188
- ideal_width = math.sqrt(target_area * ratio)
189
- ideal_height = ideal_width / ratio
190
-
191
- new_width = round(ideal_width / 32) * 32
192
- new_height = round(ideal_height / 32) * 32
193
-
194
- new_width = max(32, new_width)
195
- new_height = max(32, new_height)
196
-
197
- new_width = int(new_width)
198
- new_height = int(new_height)
199
-
200
- resized_img = img.resize((new_width, new_height), Image.LANCZOS)
201
- return resized_img
202
-
203
- class Camera(object):
204
- """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
205
- """
206
- def __init__(self, entry):
207
- fx, fy, cx, cy = entry[1:5]
208
- self.fx = fx
209
- self.fy = fy
210
- self.cx = cx
211
- self.cy = cy
212
- w2c_mat = np.array(entry[7:]).reshape(3, 4)
213
- w2c_mat_4x4 = np.eye(4)
214
- w2c_mat_4x4[:3, :] = w2c_mat
215
- self.w2c_mat = w2c_mat_4x4
216
- self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
217
-
218
- def custom_meshgrid(*args):
219
- """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
220
- """
221
- # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
222
- if pver.parse(torch.__version__) < pver.parse('1.10'):
223
- return torch.meshgrid(*args)
224
- else:
225
- return torch.meshgrid(*args, indexing='ij')
226
-
227
- def get_relative_pose(cam_params):
228
- """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
229
- """
230
- abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
231
- abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
232
- cam_to_origin = 0
233
- target_cam_c2w = np.array([
234
- [1, 0, 0, 0],
235
- [0, 1, 0, -cam_to_origin],
236
- [0, 0, 1, 0],
237
- [0, 0, 0, 1]
238
- ])
239
- abs2rel = target_cam_c2w @ abs_w2cs[0]
240
- ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
241
- ret_poses = np.array(ret_poses, dtype=np.float32)
242
- return ret_poses
243
-
244
- def ray_condition(K, c2w, H, W, device):
245
- """Copied from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
246
- """
247
- # c2w: B, V, 4, 4
248
- # K: B, V, 4
249
-
250
- B = K.shape[0]
251
-
252
- j, i = custom_meshgrid(
253
- torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
254
- torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
255
- )
256
- i = i.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
257
- j = j.reshape([1, 1, H * W]).expand([B, 1, H * W]) + 0.5 # [B, HxW]
258
-
259
- fx, fy, cx, cy = K.chunk(4, dim=-1) # B,V, 1
260
-
261
- zs = torch.ones_like(i) # [B, HxW]
262
- xs = (i - cx) / fx * zs
263
- ys = (j - cy) / fy * zs
264
- zs = zs.expand_as(ys)
265
-
266
- directions = torch.stack((xs, ys, zs), dim=-1) # B, V, HW, 3
267
- directions = directions / directions.norm(dim=-1, keepdim=True) # B, V, HW, 3
268
-
269
- rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2) # B, V, 3, HW
270
- rays_o = c2w[..., :3, 3] # B, V, 3
271
- rays_o = rays_o[:, :, None].expand_as(rays_d) # B, V, 3, HW
272
- # c2w @ dirctions
273
- rays_dxo = torch.cross(rays_o, rays_d)
274
- plucker = torch.cat([rays_dxo, rays_d], dim=-1)
275
- plucker = plucker.reshape(B, c2w.shape[1], H, W, 6) # B, V, H, W, 6
276
- # plucker = plucker.permute(0, 1, 4, 2, 3)
277
- return plucker
278
-
279
- def process_pose_file(pose_file_path, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu', return_poses=False):
280
- """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
281
- """
282
- with open(pose_file_path, 'r') as f:
283
- poses = f.readlines()
284
-
285
- poses = [pose.strip().split(' ') for pose in poses[1:]]
286
- cam_params = [[float(x) for x in pose] for pose in poses]
287
- if return_poses:
288
- return cam_params
289
- else:
290
- cam_params = [Camera(cam_param) for cam_param in cam_params]
291
-
292
- sample_wh_ratio = width / height
293
- pose_wh_ratio = original_pose_width / original_pose_height # Assuming placeholder ratios, change as needed
294
-
295
- if pose_wh_ratio > sample_wh_ratio:
296
- resized_ori_w = height * pose_wh_ratio
297
- for cam_param in cam_params:
298
- cam_param.fx = resized_ori_w * cam_param.fx / width
299
- else:
300
- resized_ori_h = width / pose_wh_ratio
301
- for cam_param in cam_params:
302
- cam_param.fy = resized_ori_h * cam_param.fy / height
303
-
304
- intrinsic = np.asarray([[cam_param.fx * width,
305
- cam_param.fy * height,
306
- cam_param.cx * width,
307
- cam_param.cy * height]
308
- for cam_param in cam_params], dtype=np.float32)
309
-
310
- K = torch.as_tensor(intrinsic)[None] # [1, 1, 4]
311
- c2ws = get_relative_pose(cam_params) # Assuming this function is defined elsewhere
312
- c2ws = torch.as_tensor(c2ws)[None] # [1, n_frame, 4, 4]
313
- plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous() # V, 6, H, W
314
- plucker_embedding = plucker_embedding[None]
315
- plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
316
- return plucker_embedding
317
-
318
- def process_pose_params(cam_params, width=672, height=384, original_pose_width=1280, original_pose_height=720, device='cpu'):
319
- """Modified from https://github.com/hehao13/CameraCtrl/blob/main/inference.py
320
- """
321
- cam_params = [Camera(cam_param) for cam_param in cam_params]
322
-
323
- sample_wh_ratio = width / height
324
- pose_wh_ratio = original_pose_width / original_pose_height # Assuming placeholder ratios, change as needed
325
-
326
- if pose_wh_ratio > sample_wh_ratio:
327
- resized_ori_w = height * pose_wh_ratio
328
- for cam_param in cam_params:
329
- cam_param.fx = resized_ori_w * cam_param.fx / width
330
- else:
331
- resized_ori_h = width / pose_wh_ratio
332
- for cam_param in cam_params:
333
- cam_param.fy = resized_ori_h * cam_param.fy / height
334
-
335
- intrinsic = np.asarray([[cam_param.fx * width,
336
- cam_param.fy * height,
337
- cam_param.cx * width,
338
- cam_param.cy * height]
339
- for cam_param in cam_params], dtype=np.float32)
340
-
341
- K = torch.as_tensor(intrinsic)[None] # [1, 1, 4]
342
- c2ws = get_relative_pose(cam_params) # Assuming this function is defined elsewhere
343
- c2ws = torch.as_tensor(c2ws)[None] # [1, n_frame, 4, 4]
344
- plucker_embedding = ray_condition(K, c2ws, height, width, device=device)[0].permute(0, 3, 1, 2).contiguous() # V, 6, H, W
345
- plucker_embedding = plucker_embedding[None]
346
- plucker_embedding = rearrange(plucker_embedding, "b f c h w -> b f h w c")[0]
347
- return plucker_embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
videox_fun/dist/__init__.py DELETED
@@ -1,72 +0,0 @@
- import importlib.util
- 
- from .cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
- from .flux2_xfuser import Flux2MultiGPUsAttnProcessor2_0
- from .flux_xfuser import FluxMultiGPUsAttnProcessor2_0
- from .fsdp import shard_model
- from .fuser import (get_sequence_parallel_rank,
-                     get_sequence_parallel_world_size, get_sp_group,
-                     get_world_group, init_distributed_environment,
-                     initialize_model_parallel, sequence_parallel_all_gather,
-                     sequence_parallel_chunk, set_multi_gpus_devices,
-                     xFuserLongContextAttention)
- from .hunyuanvideo_xfuser import HunyuanVideoMultiGPUsAttnProcessor2_0
- from .qwen_xfuser import QwenImageMultiGPUsAttnProcessor2_0
- from .wan_xfuser import usp_attn_forward, usp_attn_s2v_forward
- from .z_image_xfuser import ZMultiGPUsSingleStreamAttnProcessor
- 
- # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
- if importlib.util.find_spec("paifuser") is not None:
-     # --------------------------------------------------------------- #
-     #   The simple_wrapper is used to solve the problem
-     #   about conflicts between cython and torch.compile
-     # --------------------------------------------------------------- #
-     def simple_wrapper(func):
-         def inner(*args, **kwargs):
-             return func(*args, **kwargs)
-         return inner
- 
-     # --------------------------------------------------------------- #
-     #   Sparse Attention Kernel
-     # --------------------------------------------------------------- #
-     from paifuser.models import parallel_magvit_vae
-     from paifuser.ops import wan_usp_sparse_attention_wrapper
- 
-     from . import wan_xfuser
- 
-     # --------------------------------------------------------------- #
-     #   Sparse Attention
-     # --------------------------------------------------------------- #
-     usp_sparse_attn_wrap_forward = simple_wrapper(wan_usp_sparse_attention_wrapper()(wan_xfuser.usp_attn_forward))
-     wan_xfuser.usp_attn_forward = usp_sparse_attn_wrap_forward
-     usp_attn_forward = usp_sparse_attn_wrap_forward
-     print("Import PAI VAE Turbo and Sparse Attention")
- 
-     # --------------------------------------------------------------- #
-     #   Fast Rope Kernel
-     # --------------------------------------------------------------- #
-     import types
- 
-     import torch
-     from paifuser.ops import (ENABLE_KERNEL, usp_fast_rope_apply_qk,
-                               usp_rope_apply_real_qk)
- 
-     def deepcopy_function(f):
-         return types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__)
- 
-     local_rope_apply_qk = deepcopy_function(wan_xfuser.rope_apply_qk)
- 
-     if ENABLE_KERNEL:
-         def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
-             if torch.is_grad_enabled():
-                 return local_rope_apply_qk(q, k, grid_sizes, freqs)
-             else:
-                 return usp_fast_rope_apply_qk(q, k, grid_sizes, freqs)
- 
-     else:
-         def adaptive_fast_usp_rope_apply_qk(q, k, grid_sizes, freqs):
-             return usp_rope_apply_real_qk(q, k, grid_sizes, freqs)
- 
-     wan_xfuser.rope_apply_qk = adaptive_fast_usp_rope_apply_qk
-     rope_apply_qk = adaptive_fast_usp_rope_apply_qk
-     print("Import PAI Fast rope")
 
videox_fun/dist/cogvideox_xfuser.py DELETED
@@ -1,93 +0,0 @@
- from typing import Optional
- 
- import torch
- import torch.nn.functional as F
- from diffusers.models.attention import Attention
- from diffusers.models.embeddings import apply_rotary_emb
- 
- from .fuser import (get_sequence_parallel_rank,
-                     get_sequence_parallel_world_size, get_sp_group,
-                     init_distributed_environment, initialize_model_parallel,
-                     xFuserLongContextAttention)
- 
- class CogVideoXMultiGPUsAttnProcessor2_0:
-     r"""
-     Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
-     query and key vectors, but does not include spatial normalization.
-     """
- 
-     def __init__(self):
-         if not hasattr(F, "scaled_dot_product_attention"):
-             raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
- 
-     def __call__(
-         self,
-         attn: Attention,
-         hidden_states: torch.Tensor,
-         encoder_hidden_states: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         image_rotary_emb: Optional[torch.Tensor] = None,
-     ) -> torch.Tensor:
-         text_seq_length = encoder_hidden_states.size(1)
- 
-         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
- 
-         batch_size, sequence_length, _ = (
-             hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-         )
- 
-         if attention_mask is not None:
-             attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-             attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
- 
-         query = attn.to_q(hidden_states)
-         key = attn.to_k(hidden_states)
-         value = attn.to_v(hidden_states)
- 
-         inner_dim = key.shape[-1]
-         head_dim = inner_dim // attn.heads
- 
-         query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
- 
-         if attn.norm_q is not None:
-             query = attn.norm_q(query)
-         if attn.norm_k is not None:
-             key = attn.norm_k(key)
- 
-         # Apply RoPE if needed
-         if image_rotary_emb is not None:
-             query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
-             if not attn.is_cross_attention:
-                 key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
- 
-         img_q = query[:, :, text_seq_length:].transpose(1, 2)
-         txt_q = query[:, :, :text_seq_length].transpose(1, 2)
-         img_k = key[:, :, text_seq_length:].transpose(1, 2)
-         txt_k = key[:, :, :text_seq_length].transpose(1, 2)
-         img_v = value[:, :, text_seq_length:].transpose(1, 2)
-         txt_v = value[:, :, :text_seq_length].transpose(1, 2)
- 
-         hidden_states = xFuserLongContextAttention()(
-             None,
-             img_q, img_k, img_v, dropout_p=0.0, causal=False,
-             joint_tensor_query=txt_q,
-             joint_tensor_key=txt_k,
-             joint_tensor_value=txt_v,
-             joint_strategy='front',
-         )
- 
-         hidden_states = hidden_states.flatten(2, 3)
-         hidden_states = hidden_states.to(query.dtype)
- 
-         # linear proj
-         hidden_states = attn.to_out[0](hidden_states)
-         # dropout
-         hidden_states = attn.to_out[1](hidden_states)
- 
-         encoder_hidden_states, hidden_states = hidden_states.split(
-             [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
-         )
-         return hidden_states, encoder_hidden_states
 
videox_fun/dist/flux2_xfuser.py DELETED
@@ -1,194 +0,0 @@
- from typing import Optional, Tuple, Union
- 
- import torch
- import torch.nn.functional as F
- from diffusers.models.attention_processor import Attention
- 
- from .fuser import xFuserLongContextAttention
- 
- 
- def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
-     query = attn.to_q(hidden_states)
-     key = attn.to_k(hidden_states)
-     value = attn.to_v(hidden_states)
- 
-     encoder_query = encoder_key = encoder_value = None
-     if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
-         encoder_query = attn.add_q_proj(encoder_hidden_states)
-         encoder_key = attn.add_k_proj(encoder_hidden_states)
-         encoder_value = attn.add_v_proj(encoder_hidden_states)
- 
-     return query, key, value, encoder_query, encoder_key, encoder_value
- 
- 
- def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
-     return _get_projections(attn, hidden_states, encoder_hidden_states)
- 
- 
- def apply_rotary_emb(
-     x: torch.Tensor,
-     freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-     use_real: bool = True,
-     use_real_unbind_dim: int = -1,
-     sequence_dim: int = 2,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
-     """
-     Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-     to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-     reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-     tensors contain rotary embeddings and are returned as real tensors.
- 
-     Args:
-         x (`torch.Tensor`):
-             Query or key tensor to apply rotary embeddings. [B, H, S, D]
-         freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
- 
-     Returns:
-         Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-     """
-     if use_real:
-         cos, sin = freqs_cis  # [S, D]
-         if sequence_dim == 2:
-             cos = cos[None, None, :, :]
-             sin = sin[None, None, :, :]
-         elif sequence_dim == 1:
-             cos = cos[None, :, None, :]
-             sin = sin[None, :, None, :]
-         else:
-             raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
- 
-         cos, sin = cos.to(x.device), sin.to(x.device)
- 
-         if use_real_unbind_dim == -1:
-             # Used for flux, cogvideox, hunyuan-dit
-             x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2]
-             x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-         elif use_real_unbind_dim == -2:
-             # Used for Stable Audio, OmniGen, CogView4 and Cosmos
-             x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2]
-             x_rotated = torch.cat([-x_imag, x_real], dim=-1)
-         else:
-             raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
- 
-         out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
- 
-         return out
-     else:
-         # used for lumina
-         x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-         freqs_cis = freqs_cis.unsqueeze(2)
-         x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
- 
-         return x_out.type_as(x)
- 
- 
- class Flux2MultiGPUsAttnProcessor2_0:
-     r"""
-     Processor for implementing scaled dot-product attention for the Flux2 model. It applies a rotary embedding on
-     query and key vectors, but does not include spatial normalization.
-     """
- 
-     def __init__(self):
-         if not hasattr(F, "scaled_dot_product_attention"):
-             raise ImportError("Flux2MultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
- 
-     def __call__(
-         self,
-         attn: "FluxAttention",
-         hidden_states: torch.Tensor,
-         encoder_hidden_states: Optional[torch.Tensor] = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         image_rotary_emb: Optional[torch.Tensor] = None,
-         text_seq_len: int = None,
-     ) -> torch.FloatTensor:
-         # Determine which type of attention we're processing
-         is_parallel_self_attn = hasattr(attn, 'to_qkv_mlp_proj') and attn.to_qkv_mlp_proj is not None
- 
-         if is_parallel_self_attn:
-             # Parallel in (QKV + MLP in) projection
-             hidden_states = attn.to_qkv_mlp_proj(hidden_states)
-             qkv, mlp_hidden_states = torch.split(
-                 hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
-             )
- 
-             # Handle the attention logic
-             query, key, value = qkv.chunk(3, dim=-1)
- 
-         else:
-             query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
-                 attn, hidden_states, encoder_hidden_states
-             )
- 
-         # Common processing for query, key, value
-         query = query.unflatten(-1, (attn.heads, -1))
-         key = key.unflatten(-1, (attn.heads, -1))
-         value = value.unflatten(-1, (attn.heads, -1))
- 
-         query = attn.norm_q(query)
-         key = attn.norm_k(key)
- 
-         # Handle encoder projections (only for standard attention)
-         if not is_parallel_self_attn and attn.added_kv_proj_dim is not None:
-             encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
-             encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
-             encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
- 
-             encoder_query = attn.norm_added_q(encoder_query)
-             encoder_key = attn.norm_added_k(encoder_key)
- 
-             query = torch.cat([encoder_query, query], dim=1)
-             key = torch.cat([encoder_key, key], dim=1)
-             value = torch.cat([encoder_value, value], dim=1)
- 
-         # Apply rotary embeddings
-         if image_rotary_emb is not None:
-             query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
-             key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
- 
-         if not is_parallel_self_attn and attn.added_kv_proj_dim is not None and text_seq_len is None:
-             text_seq_len = encoder_query.shape[1]
- 
-         txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
-         img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
- 
-         half_dtypes = (torch.float16, torch.bfloat16)
-         def half(x):
-             return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
- 
-         hidden_states = xFuserLongContextAttention()(
-             None,
-             half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
-             joint_tensor_query=half(txt_query) if txt_query is not None else None,
-             joint_tensor_key=half(txt_key) if txt_key is not None else None,
-             joint_tensor_value=half(txt_value) if txt_value is not None else None,
-             joint_strategy='front',
-         )
-         hidden_states = hidden_states.flatten(2, 3)
-         hidden_states = hidden_states.to(query.dtype)
- 
-         if is_parallel_self_attn:
-             # Handle the feedforward (FF) logic
-             mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
- 
-             # Concatenate and parallel output projection
-             hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
-             hidden_states = attn.to_out(hidden_states)
- 
-             return hidden_states
- 
-         else:
-             # Split encoder and latent hidden states if encoder was used
-             if encoder_hidden_states is not None:
-                 encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
-                     [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
-                 )
-                 encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
- 
-             # Project output
-             hidden_states = attn.to_out[0](hidden_states)
-             hidden_states = attn.to_out[1](hidden_states)
- 
-             if encoder_hidden_states is not None:
-                 return hidden_states, encoder_hidden_states
-             else:
-                 return hidden_states
 
videox_fun/dist/flux_xfuser.py DELETED
@@ -1,165 +0,0 @@
- from typing import Optional, Tuple, Union
- 
- import torch
- import torch.nn.functional as F
- from diffusers.models.attention_processor import Attention
- 
- from .fuser import xFuserLongContextAttention
- 
- 
- def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
-     query = attn.to_q(hidden_states)
-     key = attn.to_k(hidden_states)
-     value = attn.to_v(hidden_states)
- 
-     encoder_query = encoder_key = encoder_value = None
-     if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
-         encoder_query = attn.add_q_proj(encoder_hidden_states)
-         encoder_key = attn.add_k_proj(encoder_hidden_states)
-         encoder_value = attn.add_v_proj(encoder_hidden_states)
- 
-     return query, key, value, encoder_query, encoder_key, encoder_value
- 
- 
- def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
-     return _get_projections(attn, hidden_states, encoder_hidden_states)
- 
- 
- def apply_rotary_emb(
-     x: torch.Tensor,
-     freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-     use_real: bool = True,
-     use_real_unbind_dim: int = -1,
-     sequence_dim: int = 2,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
-     """
-     Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-     to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-     reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-     tensors contain rotary embeddings and are returned as real tensors.
- 
-     Args:
-         x (`torch.Tensor`):
-             Query or key tensor to apply rotary embeddings. [B, H, S, D]
-         freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
- 
-     Returns:
-         Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-     """
-     if use_real:
-         cos, sin = freqs_cis  # [S, D]
-         if sequence_dim == 2:
-             cos = cos[None, None, :, :]
-             sin = sin[None, None, :, :]
-         elif sequence_dim == 1:
-             cos = cos[None, :, None, :]
-             sin = sin[None, :, None, :]
-         else:
-             raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
- 
-         cos, sin = cos.to(x.device), sin.to(x.device)
- 
-         if use_real_unbind_dim == -1:
-             # Used for flux, cogvideox, hunyuan-dit
-             x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, H, S, D//2]
-             x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-         elif use_real_unbind_dim == -2:
-             # Used for Stable Audio, OmniGen, CogView4 and Cosmos
-             x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, H, S, D//2]
-             x_rotated = torch.cat([-x_imag, x_real], dim=-1)
-         else:
-             raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
- 
-         out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
- 
-         return out
-     else:
-         # used for lumina
-         x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-         freqs_cis = freqs_cis.unsqueeze(2)
-         x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
- 
-         return x_out.type_as(x)
- 
- 
- class FluxMultiGPUsAttnProcessor2_0:
-     r"""
-     Processor for implementing scaled dot-product attention for the Flux model. It applies a rotary embedding on
-     query and key vectors, but does not include spatial normalization.
-     """
- 
-     def __init__(self):
-         if not hasattr(F, "scaled_dot_product_attention"):
-             raise ImportError("FluxMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
- 
-     def __call__(
-         self,
-         attn: "FluxAttention",
-         hidden_states: torch.Tensor,
-         encoder_hidden_states: torch.Tensor = None,
-         attention_mask: Optional[torch.Tensor] = None,
-         image_rotary_emb: Optional[torch.Tensor] = None,
-         text_seq_len: int = None,
-     ) -> torch.FloatTensor:
-         query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
-             attn, hidden_states, encoder_hidden_states
-         )
- 
-         query = query.unflatten(-1, (attn.heads, -1))
-         key = key.unflatten(-1, (attn.heads, -1))
-         value = value.unflatten(-1, (attn.heads, -1))
- 
-         query = attn.norm_q(query)
-         key = attn.norm_k(key)
- 
-         if attn.added_kv_proj_dim is not None:
-             encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
-             encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
-             encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
- 
-             encoder_query = attn.norm_added_q(encoder_query)
-             encoder_key = attn.norm_added_k(encoder_key)
- 
-             query = torch.cat([encoder_query, query], dim=1)
-             key = torch.cat([encoder_key, key], dim=1)
-             value = torch.cat([encoder_value, value], dim=1)
- 
-         # Apply rotary embeddings
-         if image_rotary_emb is not None:
-             query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
-             key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
- 
-         if attn.added_kv_proj_dim is not None and text_seq_len is None:
-             text_seq_len = encoder_query.shape[1]
- 
-         txt_query, txt_key, txt_value = query[:, :text_seq_len], key[:, :text_seq_len], value[:, :text_seq_len]
-         img_query, img_key, img_value = query[:, text_seq_len:], key[:, text_seq_len:], value[:, text_seq_len:]
- 
-         half_dtypes = (torch.float16, torch.bfloat16)
-         def half(x):
-             return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
- 
-         hidden_states = xFuserLongContextAttention()(
-             None,
-             half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
-             joint_tensor_query=half(txt_query) if txt_query is not None else None,
-             joint_tensor_key=half(txt_key) if txt_key is not None else None,
-             joint_tensor_value=half(txt_value) if txt_value is not None else None,
-             joint_strategy='front',
-         )
- 
-         # Reshape back
-         hidden_states = hidden_states.flatten(2, 3)
-         hidden_states = hidden_states.to(img_query.dtype)
- 
-         if encoder_hidden_states is not None:
-             encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
-                 [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
-             )
-             hidden_states = attn.to_out[0](hidden_states)
-             hidden_states = attn.to_out[1](hidden_states)
-             encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
- 
-             return hidden_states, encoder_hidden_states
-         else:
-             return hidden_states
 
videox_fun/dist/fsdp.py DELETED
@@ -1,44 +0,0 @@
- # Copied from https://github.com/Wan-Video/Wan2.1/blob/main/wan/distributed/fsdp.py
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
- import gc
- from functools import partial
- 
- import torch
- from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
- from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
- from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
- from torch.distributed.utils import _free_storage
- 
- 
- def shard_model(
-     model,
-     device_id,
-     param_dtype=torch.bfloat16,
-     reduce_dtype=torch.float32,
-     buffer_dtype=torch.float32,
-     process_group=None,
-     sharding_strategy=ShardingStrategy.FULL_SHARD,
-     sync_module_states=True,
-     module_to_wrapper=None,
- ):
-     model = FSDP(
-         module=model,
-         process_group=process_group,
-         sharding_strategy=sharding_strategy,
-         auto_wrap_policy=partial(
-             lambda_auto_wrap_policy, lambda_fn=lambda m: m in (model.blocks if module_to_wrapper is None else module_to_wrapper)),
-         mixed_precision=MixedPrecision(
-             param_dtype=param_dtype,
-             reduce_dtype=reduce_dtype,
-             buffer_dtype=buffer_dtype),
-         device_id=device_id,
-         sync_module_states=sync_module_states)
-     return model
- 
- def free_model(model):
-     for m in model.modules():
-         if isinstance(m, FSDP):
-             _free_storage(m._handle.flat_param.data)
-     del model
-     gc.collect()
-     torch.cuda.empty_cache()
 
videox_fun/dist/fuser.py DELETED
@@ -1,87 +0,0 @@
- import importlib.util
- 
- import torch
- import torch.distributed as dist
- 
- try:
-     # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
-     if importlib.util.find_spec("paifuser") is not None:
-         import paifuser
-         from paifuser.xfuser.core.distributed import (
-             get_sequence_parallel_rank, get_sequence_parallel_world_size,
-             get_sp_group, get_world_group, init_distributed_environment,
-             initialize_model_parallel, model_parallel_is_initialized)
-         from paifuser.xfuser.core.long_ctx_attention import \
-             xFuserLongContextAttention
-         print("Import PAI DiT Turbo")
-     else:
-         import xfuser
-         from xfuser.core.distributed import (get_sequence_parallel_rank,
-                                              get_sequence_parallel_world_size,
-                                              get_sp_group, get_world_group,
-                                              init_distributed_environment,
-                                              initialize_model_parallel,
-                                              model_parallel_is_initialized)
-         from xfuser.core.long_ctx_attention import xFuserLongContextAttention
-         print("Xfuser import successful")
- except Exception as ex:
-     get_sequence_parallel_world_size = None
-     get_sequence_parallel_rank = None
-     xFuserLongContextAttention = None
-     get_sp_group = None
-     get_world_group = None
-     init_distributed_environment = None
-     initialize_model_parallel = None
- 
- def set_multi_gpus_devices(ulysses_degree, ring_degree, classifier_free_guidance_degree=1):
-     if ulysses_degree > 1 or ring_degree > 1 or classifier_free_guidance_degree > 1:
-         if get_sp_group is None:
-             raise RuntimeError("xfuser is not installed.")
-         dist.init_process_group("nccl")
-         print('parallel inference enabled: ulysses_degree=%d ring_degree=%d classifier_free_guidance_degree=%d rank=%d world_size=%d' % (
-             ulysses_degree, ring_degree, classifier_free_guidance_degree, dist.get_rank(),
-             dist.get_world_size()))
-         assert dist.get_world_size() == ring_degree * ulysses_degree * classifier_free_guidance_degree, \
-             "number of GPUs(%d) should be equal to ring_degree * ulysses_degree * classifier_free_guidance_degree." % dist.get_world_size()
-         init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
-         initialize_model_parallel(sequence_parallel_degree=ring_degree * ulysses_degree,
-                                   classifier_free_guidance_degree=classifier_free_guidance_degree,
-                                   ring_degree=ring_degree,
-                                   ulysses_degree=ulysses_degree)
-         # device = torch.device("cuda:%d" % dist.get_rank())
-         device = torch.device(f"cuda:{get_world_group().local_rank}")
-         print('rank=%d device=%s' % (get_world_group().rank, str(device)))
-     else:
-         device = "cuda"
-     return device
- 
- def sequence_parallel_chunk(x, dim=1):
-     if get_sequence_parallel_world_size is None or not model_parallel_is_initialized():
-         return x
- 
-     sp_world_size = get_sequence_parallel_world_size()
-     if sp_world_size <= 1:
-         return x
- 
-     sp_rank = get_sequence_parallel_rank()
-     sp_group = get_sp_group()
- 
-     if x.size(1) % sp_world_size != 0:
-         raise ValueError(f"Dim 1 of x ({x.size(1)}) not divisible by SP world size ({sp_world_size})")
- 
-     chunks = torch.chunk(x, sp_world_size, dim=1)
-     x = chunks[sp_rank]
- 
-     return x
- 
- def sequence_parallel_all_gather(x, dim=1):
-     if get_sequence_parallel_world_size is None or not model_parallel_is_initialized():
-         return x
- 
-     sp_world_size = get_sequence_parallel_world_size()
-     if sp_world_size <= 1:
-         return x  # No gathering needed
- 
-     sp_group = get_sp_group()
-     gathered_x = sp_group.all_gather(x, dim=dim)
-     return gathered_x
 
videox_fun/dist/hunyuanvideo_xfuser.py DELETED
@@ -1,166 +0,0 @@
- from typing import Optional
- 
- import torch
- import torch.nn.functional as F
- from diffusers.models.attention import Attention
- from diffusers.models.embeddings import apply_rotary_emb
- 
- from .fuser import (get_sequence_parallel_rank,
-                     get_sequence_parallel_world_size, get_sp_group,
-                     init_distributed_environment, initialize_model_parallel,
-                     xFuserLongContextAttention)
- 
- def extract_seqlens_from_mask(attn_mask, text_seq_length):
-     if attn_mask is None:
-         return None
- 
-     if len(attn_mask.shape) == 4:
-         bs, _, _, seq_len = attn_mask.shape
- 
-         if attn_mask.dtype == torch.bool:
-             valid_mask = attn_mask.squeeze(1).squeeze(1)
-         else:
-             valid_mask = ~torch.isinf(attn_mask.squeeze(1).squeeze(1))
-     elif len(attn_mask.shape) == 3:
-         raise ValueError(
-             "attn_mask should be 2D or 4D tensor, but got {}".format(
-                 attn_mask.shape))
- 
-     seqlens = valid_mask[:, -text_seq_length:].sum(dim=1)
-     return seqlens
- 
- class HunyuanVideoMultiGPUsAttnProcessor2_0:
-     r"""
-     Processor for implementing scaled dot-product attention for the HunyuanVideo model. It applies a rotary embedding on
-     query and key vectors, but does not include spatial normalization.
-     """
- 
-     def __init__(self):
-         if xFuserLongContextAttention is not None:
-             try:
-                 self.hybrid_seq_parallel_attn = xFuserLongContextAttention()
-             except Exception:
-                 self.hybrid_seq_parallel_attn = None
-         else:
-             self.hybrid_seq_parallel_attn = None
-         if not hasattr(F, "scaled_dot_product_attention"):
-             raise ImportError("HunyuanVideoMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
- 
-     def __call__(
-         self,
-         attn: Attention,
-         hidden_states: torch.Tensor,
-         encoder_hidden_states: torch.Tensor,
-         attention_mask: Optional[torch.Tensor] = None,
-         image_rotary_emb: Optional[torch.Tensor] = None,
-     ) -> torch.Tensor:
-         if attn.add_q_proj is None and encoder_hidden_states is not None:
-             hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
- 
-         # 1. QKV projections
-         query = attn.to_q(hidden_states)
-         key = attn.to_k(hidden_states)
-         value = attn.to_v(hidden_states)
- 
-         query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-         key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-         value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
- 
-         # 2. QK normalization
-         if attn.norm_q is not None:
-             query = attn.norm_q(query)
-         if attn.norm_k is not None:
-             key = attn.norm_k(key)
- 
-         # 3. Rotational positional embeddings applied to latent stream
-         if image_rotary_emb is not None:
-             if attn.add_q_proj is None and encoder_hidden_states is not None:
-                 query = torch.cat(
-                     [
-                         apply_rotary_emb(query[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                         query[:, :, -encoder_hidden_states.shape[1] :],
-                     ],
-                     dim=2,
-                 )
-                 key = torch.cat(
-                     [
-                         apply_rotary_emb(key[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
-                         key[:, :, -encoder_hidden_states.shape[1] :],
-                     ],
-                     dim=2,
-                 )
-             else:
-                 query = apply_rotary_emb(query, image_rotary_emb)
-                 key = apply_rotary_emb(key, image_rotary_emb)
- 
-         # 4. Encoder condition QKV projection and normalization
-         if attn.add_q_proj is not None and encoder_hidden_states is not None:
-             encoder_query = attn.add_q_proj(encoder_hidden_states)
-             encoder_key = attn.add_k_proj(encoder_hidden_states)
-             encoder_value = attn.add_v_proj(encoder_hidden_states)
- 
-             encoder_query = encoder_query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-             encoder_key = encoder_key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-             encoder_value = encoder_value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
- 
-             if attn.norm_added_q is not None:
-                 encoder_query = attn.norm_added_q(encoder_query)
-             if attn.norm_added_k is not None:
-                 encoder_key = attn.norm_added_k(encoder_key)
- 
-             query = torch.cat([query, encoder_query], dim=2)
-             key = torch.cat([key, encoder_key], dim=2)
-             value = torch.cat([value, encoder_value], dim=2)
- 
-         # 5. Attention
-         if encoder_hidden_states is not None:
-             text_seq_length = encoder_hidden_states.size(1)
- 
-             q_lens = k_lens = extract_seqlens_from_mask(attention_mask, text_seq_length)
- 
-             img_q = query[:, :, :-text_seq_length].transpose(1, 2)
-             txt_q = query[:, :, -text_seq_length:].transpose(1, 2)
-             img_k = key[:, :, :-text_seq_length].transpose(1, 2)
-             txt_k = key[:, :, -text_seq_length:].transpose(1, 2)
-             img_v = value[:, :, :-text_seq_length].transpose(1, 2)
-             txt_v = value[:, :, -text_seq_length:].transpose(1, 2)
- 
-             hidden_states = torch.zeros_like(query.transpose(1, 2))
-             local_q_length = img_q.size()[1]
-             for i in range(len(q_lens)):
-                 hidden_states[i][:local_q_length + q_lens[i]] = self.hybrid_seq_parallel_attn(
-                     None,
-                     img_q[i].unsqueeze(0), img_k[i].unsqueeze(0), img_v[i].unsqueeze(0), dropout_p=0.0, causal=False,
-                     joint_tensor_query=txt_q[i][:q_lens[i]].unsqueeze(0),
-                     joint_tensor_key=txt_k[i][:q_lens[i]].unsqueeze(0),
-                     joint_tensor_value=txt_v[i][:q_lens[i]].unsqueeze(0),
-                     joint_strategy='rear',
-                 )
-         else:
-             query = query.transpose(1, 2)
-             key = key.transpose(1, 2)
-             value = value.transpose(1, 2)
-             hidden_states = self.hybrid_seq_parallel_attn(
-                 None,
-                 query, key, value, dropout_p=0.0, causal=False
-             )
- 
-         hidden_states = hidden_states.flatten(2, 3)
-         hidden_states = hidden_states.to(query.dtype)
- 
-         # 6. Output projection
-         if encoder_hidden_states is not None:
-             hidden_states, encoder_hidden_states = (
-                 hidden_states[:, : -encoder_hidden_states.shape[1]],
-                 hidden_states[:, -encoder_hidden_states.shape[1] :],
-             )
- 
-             if getattr(attn, "to_out", None) is not None:
-                 hidden_states = attn.to_out[0](hidden_states)
-                 hidden_states = attn.to_out[1](hidden_states)
- 
-             if getattr(attn, "to_add_out", None) is not None:
-                 encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
- 
-         return hidden_states, encoder_hidden_states
 
videox_fun/dist/qwen_xfuser.py DELETED
@@ -1,176 +0,0 @@
- import functools
- import glob
- import json
- import math
- import os
- import types
- import warnings
- from typing import Any, Dict, List, Optional, Tuple, Union
- 
- import numpy as np
- import torch
- import torch.cuda.amp as amp
- import torch.nn as nn
- import torch.nn.functional as F
- from diffusers.configuration_utils import ConfigMixin, register_to_config
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
- from diffusers.models.attention import FeedForward
- from diffusers.models.attention_processor import Attention
- from diffusers.models.embeddings import TimestepEmbedding, Timesteps
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
- from diffusers.models.modeling_utils import ModelMixin
- from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
-                              scale_lora_layers, unscale_lora_layers)
- from diffusers.utils.torch_utils import maybe_allow_in_graph
- from torch import nn
- from .fuser import (get_sequence_parallel_rank,
-                     get_sequence_parallel_world_size, get_sp_group,
-                     init_distributed_environment, initialize_model_parallel,
-                     xFuserLongContextAttention)
- 
- def apply_rotary_emb_qwen(
-     x: torch.Tensor,
-     freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-     use_real: bool = True,
-     use_real_unbind_dim: int = -1,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
-     """
-     Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
-     to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
-     reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
-     tensors contain rotary embeddings and are returned as real tensors.
- 
-     Args:
-         x (`torch.Tensor`):
-             Query or key tensor to apply rotary embeddings. [B, S, H, D]
-         freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
- 
-     Returns:
-         Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-     """
-     if use_real:
-         cos, sin = freqs_cis  # [S, D]
-         cos = cos[None, None]
-         sin = sin[None, None]
-         cos, sin = cos.to(x.device), sin.to(x.device)
- 
-         if use_real_unbind_dim == -1:
-             # Used for flux, cogvideox, hunyuan-dit
-             x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-             x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-         elif use_real_unbind_dim == -2:
-             # Used for Stable Audio, OmniGen, CogView4 and Cosmos
-             x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
-             x_rotated = torch.cat([-x_imag, x_real], dim=-1)
-         else:
-             raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
- 
-         out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
- 
-         return out
-     else:
-         x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
-         freqs_cis = freqs_cis.unsqueeze(1)
-         x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
- 
-         return x_out.type_as(x)
- 
- 
- class QwenImageMultiGPUsAttnProcessor2_0:
-     r"""
-     Processor for implementing scaled dot-product attention for the QwenImage model. It applies a rotary embedding on
-     query and key vectors, but does not include spatial normalization.
-     """
- 
-     def __init__(self):
-         if not hasattr(F, "scaled_dot_product_attention"):
-             raise ImportError("QwenImageMultiGPUsAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
- 
-     def __call__(
-         self,
-         attn: Attention,
-         hidden_states: torch.FloatTensor,  # Image stream
-         encoder_hidden_states: torch.FloatTensor = None,  # Text stream
-         encoder_hidden_states_mask: torch.FloatTensor = None,
-         attention_mask: Optional[torch.FloatTensor] = None,
-         image_rotary_emb: Optional[torch.Tensor] = None,
-     ) -> torch.FloatTensor:
-         if encoder_hidden_states is None:
-             raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")
- 
-         seq_txt = encoder_hidden_states.shape[1]
- 
-         # Compute QKV for image stream (sample projections)
-         img_query = attn.to_q(hidden_states)
-         img_key = attn.to_k(hidden_states)
-         img_value = attn.to_v(hidden_states)
- 
-         # Compute QKV for text stream (context projections)
-         txt_query = attn.add_q_proj(encoder_hidden_states)
-         txt_key = attn.add_k_proj(encoder_hidden_states)
-         txt_value = attn.add_v_proj(encoder_hidden_states)
- 
-         # Reshape for multi-head attention
-         img_query = img_query.unflatten(-1, (attn.heads, -1))
-         img_key = img_key.unflatten(-1, (attn.heads, -1))
-         img_value = img_value.unflatten(-1, (attn.heads, -1))
- 
-         txt_query = txt_query.unflatten(-1, (attn.heads, -1))
-         txt_key = txt_key.unflatten(-1, (attn.heads, -1))
-         txt_value = txt_value.unflatten(-1, (attn.heads, -1))
- 
-         # Apply QK normalization
-         if attn.norm_q is not None:
-             img_query = attn.norm_q(img_query)
-         if attn.norm_k is not None:
-             img_key = attn.norm_k(img_key)
-         if attn.norm_added_q is not None:
-             txt_query = attn.norm_added_q(txt_query)
-         if attn.norm_added_k is not None:
-             txt_key = attn.norm_added_k(txt_key)
- 
-         # Apply RoPE
-         if image_rotary_emb is not None:
-             img_freqs, txt_freqs = image_rotary_emb
-             img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
-             img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
-             txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
-             txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)
- 
-         # Concatenate for joint attention
-         # Order: [text, image]
-         # joint_query = torch.cat([txt_query, img_query], dim=1)
-         # joint_key = torch.cat([txt_key, img_key], dim=1)
-         # joint_value = torch.cat([txt_value, img_value], dim=1)
- 
-         half_dtypes = (torch.float16, torch.bfloat16)
-         def half(x):
-             return x if x.dtype in half_dtypes else x.to(torch.bfloat16)
- 
-         joint_hidden_states = xFuserLongContextAttention()(
-             None,
-             half(img_query), half(img_key), half(img_value), dropout_p=0.0, causal=False,
-             joint_tensor_query=half(txt_query),
-             joint_tensor_key=half(txt_key),
-             joint_tensor_value=half(txt_value),
-             joint_strategy='front',
-         )
- 
-         # Reshape back
-         joint_hidden_states = joint_hidden_states.flatten(2, 3)
-         joint_hidden_states = joint_hidden_states.to(img_query.dtype)
- 
-         # Split attention outputs back
-         txt_attn_output = joint_hidden_states[:, :seq_txt, :]  # Text part
-         img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part
- 
-         # Apply output projections
-         img_attn_output = attn.to_out[0](img_attn_output)
-         if len(attn.to_out) > 1:
-             img_attn_output = attn.to_out[1](img_attn_output)  # dropout
- 
-         txt_attn_output = attn.to_add_out(txt_attn_output)
- 
-         return img_attn_output, txt_attn_output
 
videox_fun/dist/wan_xfuser.py DELETED
@@ -1,180 +0,0 @@
1
- import torch
2
- import torch.cuda.amp as amp
3
-
4
- from .fuser import (get_sequence_parallel_rank,
5
- get_sequence_parallel_world_size, get_sp_group,
6
- init_distributed_environment, initialize_model_parallel,
7
- xFuserLongContextAttention)
8
-
9
-
10
- def pad_freqs(original_tensor, target_len):
11
- seq_len, s1, s2 = original_tensor.shape
12
- pad_size = target_len - seq_len
13
- padding_tensor = torch.ones(
14
- pad_size,
15
- s1,
16
- s2,
17
- dtype=original_tensor.dtype,
18
- device=original_tensor.device)
19
- padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
20
- return padded_tensor
21
-
22
- @amp.autocast(enabled=False)
23
- @torch.compiler.disable()
24
- def rope_apply(x, grid_sizes, freqs):
25
- """
26
- x: [B, L, N, C].
27
- grid_sizes: [B, 3].
28
- freqs: [M, C // 2].
29
- """
30
- s, n, c = x.size(1), x.size(2), x.size(3) // 2
31
- # split freqs
32
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
33
-
34
- # loop over samples
35
- output = []
36
- for i, (f, h, w) in enumerate(grid_sizes.tolist()):
37
- seq_len = f * h * w
38
-
39
- # precompute multipliers
40
- x_i = torch.view_as_complex(x[i, :s].to(torch.float32).reshape(
41
- s, n, -1, 2))
42
- freqs_i = torch.cat([
43
- freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
44
- freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
45
- freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
46
- ],
47
- dim=-1).reshape(seq_len, 1, -1)
48
-
49
- # apply rotary embedding
50
- sp_size = get_sequence_parallel_world_size()
51
- sp_rank = get_sequence_parallel_rank()
52
- freqs_i = pad_freqs(freqs_i, s * sp_size)
53
- s_per_rank = s
54
- freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
55
- s_per_rank), :, :]
56
- x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
57
- x_i = torch.cat([x_i, x[i, s:]])
58
-
59
- # append to collection
60
- output.append(x_i)
61
- return torch.stack(output)
62
-
63
- def rope_apply_qk(q, k, grid_sizes, freqs):
64
- q = rope_apply(q, grid_sizes, freqs)
65
- k = rope_apply(k, grid_sizes, freqs)
66
- return q, k
67
-
68
- def usp_attn_forward(self,
69
- x,
70
- seq_lens,
71
- grid_sizes,
72
- freqs,
73
- dtype=torch.bfloat16,
74
- t=0):
75
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
76
- half_dtypes = (torch.float16, torch.bfloat16)
77
-
78
- def half(x):
79
- return x if x.dtype in half_dtypes else x.to(dtype)
80
-
81
- # query, key, value function
82
- def qkv_fn(x):
83
- q = self.norm_q(self.q(x)).view(b, s, n, d)
84
- k = self.norm_k(self.k(x)).view(b, s, n, d)
85
- v = self.v(x).view(b, s, n, d)
86
- return q, k, v
87
-
88
- q, k, v = qkv_fn(x)
89
- q, k = rope_apply_qk(q, k, grid_sizes, freqs)
90
-
91
- # TODO: We should use unpaded q,k,v for attention.
92
- # k_lens = seq_lens // get_sequence_parallel_world_size()
93
- # if k_lens is not None:
94
- # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
95
- # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
96
- # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
97
-
98
- x = xFuserLongContextAttention()(
99
- None,
100
- query=half(q),
101
- key=half(k),
102
- value=half(v),
103
- window_size=self.window_size)
104
-
105
- # TODO: padding after attention.
106
- # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
107
-
108
- # output
109
- x = x.flatten(2)
110
- x = self.o(x)
111
- return x
112
-
113
- @amp.autocast(enabled=False)
114
- @torch.compiler.disable()
115
- def s2v_rope_apply(x, grid_sizes, freqs):
116
- s, n, c = x.size(1), x.size(2), x.size(3) // 2
117
- # loop over samples
118
- output = []
119
- for i, _ in enumerate(x):
120
- s = x.size(1)
121
- # precompute multipliers
122
- x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
123
- s, n, -1, 2))
124
- freqs_i = freqs[i]
125
- freqs_i_rank = pad_freqs(freqs_i, s)
126
- x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
127
- x_i = torch.cat([x_i, x[i, s:]])
128
- # append to collection
129
- output.append(x_i)
130
- return torch.stack(output).float()
131
-
132
- def s2v_rope_apply_qk(q, k, grid_sizes, freqs):
133
- q = s2v_rope_apply(q, grid_sizes, freqs)
134
- k = s2v_rope_apply(k, grid_sizes, freqs)
135
- return q, k
136
-
137
- def usp_attn_s2v_forward(self,
138
- x,
139
- seq_lens,
140
- grid_sizes,
141
- freqs,
142
- dtype=torch.bfloat16,
143
- t=0):
144
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
145
- half_dtypes = (torch.float16, torch.bfloat16)
146
-
147
- def half(x):
148
- return x if x.dtype in half_dtypes else x.to(dtype)
149
-
150
- # query, key, value function
151
- def qkv_fn(x):
152
- q = self.norm_q(self.q(x)).view(b, s, n, d)
153
- k = self.norm_k(self.k(x)).view(b, s, n, d)
154
- v = self.v(x).view(b, s, n, d)
155
- return q, k, v
156
-
157
- q, k, v = qkv_fn(x)
158
- q, k = s2v_rope_apply_qk(q, k, grid_sizes, freqs)
159
-
160
- # TODO: We should use unpadded q, k, v for attention.
161
- # k_lens = seq_lens // get_sequence_parallel_world_size()
162
- # if k_lens is not None:
163
- # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
164
- # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
165
- # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
166
-
167
- x = xFuserLongContextAttention()(
168
- None,
169
- query=half(q),
170
- key=half(k),
171
- value=half(v),
172
- window_size=self.window_size)
173
-
174
- # TODO: padding after attention.
175
- # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
176
-
177
- # output
178
- x = x.flatten(2)
179
- x = self.o(x)
180
- return x
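Before the next deleted file, a minimal standalone sketch of the complex-number rotary-embedding trick that rope_apply above relies on may help. Everything in it (the toy_rope name, the toy shapes, the way the frequencies are built) is illustrative and assumed, not part of the removed module:

import torch

def toy_rope(x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    # x: [seq, heads, dim] with even dim; freqs: complex [seq, 1, dim // 2], unit magnitude
    x_c = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))  # pair channels -> complex
    return torch.view_as_real(x_c * freqs).flatten(2).type_as(x)          # rotate, back to real

seq, heads, dim = 8, 2, 4
x = torch.randn(seq, heads, dim)
angles = torch.outer(torch.arange(seq, dtype=torch.float32), torch.ones(dim // 2))
freqs = torch.polar(torch.ones(seq, dim // 2), angles).unsqueeze(1)       # per-position unit rotations
print(toy_rope(x, freqs).shape)                                           # torch.Size([8, 2, 4])

The deleted rope_apply performs the same multiplication per sample, except that freqs is assembled from separate temporal/height/width tables and padded with pad_freqs so that every sequence-parallel rank receives an equally sized slice.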
videox_fun/dist/z_image_xfuser.py DELETED
@@ -1,85 +0,0 @@
1
- import torch
2
- import torch.cuda.amp as amp
3
- from typing import Optional
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- from diffusers.models.attention import Attention
8
-
9
- from .fuser import (get_sequence_parallel_rank,
10
- get_sequence_parallel_world_size, get_sp_group,
11
- init_distributed_environment, initialize_model_parallel,
12
- xFuserLongContextAttention)
13
-
14
- class ZMultiGPUsSingleStreamAttnProcessor:
15
- """
16
- Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
17
- original Z-ImageAttention module.
18
- """
19
-
20
- _attention_backend = None
21
- _parallel_config = None
22
-
23
- def __init__(self):
24
- if not hasattr(F, "scaled_dot_product_attention"):
25
- raise ImportError(
26
- "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
27
- )
28
-
29
- def __call__(
30
- self,
31
- attn: Attention,
32
- hidden_states: torch.Tensor,
33
- attention_mask: Optional[torch.Tensor] = None,
34
- freqs_cis: Optional[torch.Tensor] = None,
35
- ) -> torch.Tensor:
36
- query = attn.to_q(hidden_states)
37
- key = attn.to_k(hidden_states)
38
- value = attn.to_v(hidden_states)
39
-
40
- query = query.unflatten(-1, (attn.heads, -1))
41
- key = key.unflatten(-1, (attn.heads, -1))
42
- value = value.unflatten(-1, (attn.heads, -1))
43
-
44
- # Apply Norms
45
- if attn.norm_q is not None:
46
- query = attn.norm_q(query)
47
- if attn.norm_k is not None:
48
- key = attn.norm_k(key)
49
-
50
- # Apply RoPE
51
- def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
52
- with torch.amp.autocast("cuda", enabled=False):
53
- x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
54
- freqs_cis = freqs_cis.unsqueeze(2)
55
- x_out = torch.view_as_real(x * freqs_cis).flatten(3)
56
- return x_out.type_as(x_in) # todo
57
-
58
- if freqs_cis is not None:
59
- query = apply_rotary_emb(query, freqs_cis)
60
- key = apply_rotary_emb(key, freqs_cis)
61
-
62
- # Cast to correct dtype
63
- dtype = query.dtype
64
- query, key = query.to(dtype), key.to(dtype)
65
-
66
- # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
67
- if attention_mask is not None and attention_mask.ndim == 2:
68
- attention_mask = attention_mask[:, None, None, :]
69
-
70
- # Compute joint attention
71
- hidden_states = xFuserLongContextAttention()(
72
- query,
73
- key,
74
- value,
75
- )
76
-
77
- # Reshape back
78
- hidden_states = hidden_states.flatten(2, 3)
79
- hidden_states = hidden_states.to(dtype)
80
-
81
- output = attn.to_out[0](hidden_states)
82
- if len(attn.to_out) > 1: # dropout
83
- output = attn.to_out[1](output)
84
-
85
- return output
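Like the Wan attention forward above, the deleted Z-Image processor follows diffusers' attention-processor protocol: project to Q/K/V, apply the optional norms and RoPE, run the (sequence-parallel) attention kernel, then project out. A hedged, single-GPU sketch of that protocol is below; PassthroughProcessor and the toy sizes are made up for illustration, skip RoPE, and defer to scaled_dot_product_attention instead of xFuserLongContextAttention:

import torch
import torch.nn.functional as F
from diffusers.models.attention import Attention

class PassthroughProcessor:
    # Minimal processor with the same call shape as the deleted class: no RoPE, no xFuser.
    def __call__(self, attn: Attention, hidden_states, attention_mask=None, **kwargs):
        q = attn.to_q(hidden_states).unflatten(-1, (attn.heads, -1)).transpose(1, 2)
        k = attn.to_k(hidden_states).unflatten(-1, (attn.heads, -1)).transpose(1, 2)
        v = attn.to_v(hidden_states).unflatten(-1, (attn.heads, -1)).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attention_mask)
        return attn.to_out[0](out.transpose(1, 2).flatten(2))

attn = Attention(query_dim=64, heads=4, dim_head=16)
x = torch.randn(1, 10, 64)
print(PassthroughProcessor()(attn, x).shape)   # torch.Size([1, 10, 64])

In the deleted code the multi-GPU variant is presumably installed via set_attn_processor, in the same way the CogVideoX transformer further below installs its own multi-GPU processor in enable_multi_gpus_inference.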
videox_fun/models/__init__.py DELETED
@@ -1,131 +0,0 @@
1
- import importlib.util
2
-
3
- from diffusers import AutoencoderKL
4
- # from transformers import (AutoProcessor, AutoTokenizer, CLIPImageProcessor,
5
- # CLIPTextModel, CLIPTokenizer,
6
- # CLIPVisionModelWithProjection, LlamaModel,
7
- # LlamaTokenizerFast, LlavaForConditionalGeneration,
8
- # Mistral3ForConditionalGeneration, PixtralProcessor,
9
- # Qwen3ForCausalLM, T5EncoderModel, T5Tokenizer,
10
- # T5TokenizerFast)
11
-
12
- # try:
13
- # from transformers import (Qwen2_5_VLConfig,
14
- # Qwen2_5_VLForConditionalGeneration,
15
- # Qwen2Tokenizer, Qwen2VLProcessor)
16
- # except:
17
- # Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer = None, None
18
- # Qwen2VLProcessor, Qwen2_5_VLConfig = None, None
19
- # print("Your transformers version is too old to load Qwen2_5_VLForConditionalGeneration and Qwen2Tokenizer. If you wish to use QwenImage, please upgrade your transformers package to the latest version.")
20
-
21
- # from .cogvideox_transformer3d import CogVideoXTransformer3DModel
22
- # from .cogvideox_vae import AutoencoderKLCogVideoX
23
- # from .fantasytalking_audio_encoder import FantasyTalkingAudioEncoder
24
- # from .fantasytalking_transformer3d import FantasyTalkingTransformer3DModel
25
- # from .flux2_image_processor import Flux2ImageProcessor
26
- # from .flux2_transformer2d import Flux2Transformer2DModel
27
- # from .flux2_transformer2d_control import Flux2ControlTransformer2DModel
28
- # from .flux2_vae import AutoencoderKLFlux2
29
- # from .flux_transformer2d import FluxTransformer2DModel
30
- # from .hunyuanvideo_transformer3d import HunyuanVideoTransformer3DModel
31
- # from .hunyuanvideo_vae import AutoencoderKLHunyuanVideo
32
- # from .qwenimage_transformer2d import QwenImageTransformer2DModel
33
- # from .qwenimage_vae import AutoencoderKLQwenImage
34
- # from .wan_audio_encoder import WanAudioEncoder
35
- # from .wan_image_encoder import CLIPModel
36
- # from .wan_text_encoder import WanT5EncoderModel
37
- # from .wan_transformer3d import (Wan2_2Transformer3DModel, WanRMSNorm,
38
- # WanSelfAttention, WanTransformer3DModel)
39
- # from .wan_transformer3d_animate import Wan2_2Transformer3DModel_Animate
40
- # from .wan_transformer3d_s2v import Wan2_2Transformer3DModel_S2V
41
- # from .wan_transformer3d_vace import VaceWanTransformer3DModel
42
- # from .wan_vae import AutoencoderKLWan, AutoencoderKLWan_
43
- # from .wan_vae3_8 import AutoencoderKLWan2_2_, AutoencoderKLWan3_8
44
- from .z_image_transformer2d import ZImageTransformer2DModel
45
- from .z_image_transformer2d_control import ZImageControlTransformer2DModel
46
-
47
- # The pai_fuser is an internally developed acceleration package, which can be used on PAI.
48
- # if importlib.util.find_spec("paifuser") is not None:
49
- # # --------------------------------------------------------------- #
50
- # # The simple_wrapper is used to solve the problem
51
- # # about conflicts between cython and torch.compile
52
- # # --------------------------------------------------------------- #
53
- # def simple_wrapper(func):
54
- # def inner(*args, **kwargs):
55
- # return func(*args, **kwargs)
56
- # return inner
57
-
58
- # # --------------------------------------------------------------- #
59
- # # VAE Parallel Kernel
60
- # # --------------------------------------------------------------- #
61
- # from ..dist import parallel_magvit_vae
62
- # AutoencoderKLWan_.decode = simple_wrapper(parallel_magvit_vae(0.4, 8)(AutoencoderKLWan_.decode))
63
- # AutoencoderKLWan2_2_.decode = simple_wrapper(parallel_magvit_vae(0.4, 16)(AutoencoderKLWan2_2_.decode))
64
-
65
- # # --------------------------------------------------------------- #
66
- # # Sparse Attention
67
- # # --------------------------------------------------------------- #
68
- # import torch
69
- # from paifuser.ops import wan_sparse_attention_wrapper
70
-
71
- # WanSelfAttention.forward = simple_wrapper(wan_sparse_attention_wrapper()(WanSelfAttention.forward))
72
- # print("Import Sparse Attention")
73
-
74
- # WanTransformer3DModel.forward = simple_wrapper(WanTransformer3DModel.forward)
75
-
76
- # # --------------------------------------------------------------- #
77
- # # CFG Skip Turbo
78
- # # --------------------------------------------------------------- #
79
- # import os
80
-
81
- # if importlib.util.find_spec("paifuser.accelerator") is not None:
82
- # from paifuser.accelerator import (cfg_skip_turbo, disable_cfg_skip,
83
- # enable_cfg_skip, share_cfg_skip)
84
- # else:
85
- # from paifuser import (cfg_skip_turbo, disable_cfg_skip,
86
- # enable_cfg_skip, share_cfg_skip)
87
-
88
- # WanTransformer3DModel.enable_cfg_skip = enable_cfg_skip()(WanTransformer3DModel.enable_cfg_skip)
89
- # WanTransformer3DModel.disable_cfg_skip = disable_cfg_skip()(WanTransformer3DModel.disable_cfg_skip)
90
- # WanTransformer3DModel.share_cfg_skip = share_cfg_skip()(WanTransformer3DModel.share_cfg_skip)
91
-
92
- # QwenImageTransformer2DModel.enable_cfg_skip = enable_cfg_skip()(QwenImageTransformer2DModel.enable_cfg_skip)
93
- # QwenImageTransformer2DModel.disable_cfg_skip = disable_cfg_skip()(QwenImageTransformer2DModel.disable_cfg_skip)
94
- # print("Import CFG Skip Turbo")
95
-
96
- # # --------------------------------------------------------------- #
97
- # # RMS Norm Kernel
98
- # # --------------------------------------------------------------- #
99
- # from paifuser.ops import rms_norm_forward
100
- # WanRMSNorm.forward = rms_norm_forward
101
- # print("Import PAI RMS Fuse")
102
-
103
- # # --------------------------------------------------------------- #
104
- # # Fast Rope Kernel
105
- # # --------------------------------------------------------------- #
106
- # import types
107
-
108
- # import torch
109
- # from paifuser.ops import (ENABLE_KERNEL, fast_rope_apply_qk,
110
- # rope_apply_real_qk)
111
-
112
- # from . import wan_transformer3d
113
-
114
- # def deepcopy_function(f):
115
- # return types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__,closure=f.__closure__)
116
-
117
- # local_rope_apply_qk = deepcopy_function(wan_transformer3d.rope_apply_qk)
118
-
119
- # if ENABLE_KERNEL:
120
- # def adaptive_fast_rope_apply_qk(q, k, grid_sizes, freqs):
121
- # if torch.is_grad_enabled():
122
- # return local_rope_apply_qk(q, k, grid_sizes, freqs)
123
- # else:
124
- # return fast_rope_apply_qk(q, k, grid_sizes, freqs)
125
- # else:
126
- # def adaptive_fast_rope_apply_qk(q, k, grid_sizes, freqs):
127
- # return rope_apply_real_qk(q, k, grid_sizes, freqs)
128
-
129
- # wan_transformer3d.rope_apply_qk = adaptive_fast_rope_apply_qk
130
- # rope_apply_qk = adaptive_fast_rope_apply_qk
131
- # print("Import PAI Fast rope")
videox_fun/models/attention_utils.py DELETED
@@ -1,211 +0,0 @@
1
- import os
2
-
3
- import torch
4
- import warnings
5
-
6
- try:
7
- import flash_attn_interface
8
- FLASH_ATTN_3_AVAILABLE = True
9
- except ModuleNotFoundError:
10
- FLASH_ATTN_3_AVAILABLE = False
11
-
12
- try:
13
- import flash_attn
14
- FLASH_ATTN_2_AVAILABLE = True
15
- except ModuleNotFoundError:
16
- FLASH_ATTN_2_AVAILABLE = False
17
-
18
- try:
19
- major, minor = torch.cuda.get_device_capability(0)
20
- if f"{major}.{minor}" == "8.0":
21
- from sageattention_sm80 import sageattn
22
- SAGE_ATTENTION_AVAILABLE = True
23
- elif f"{major}.{minor}" == "8.6":
24
- from sageattention_sm86 import sageattn
25
- SAGE_ATTENTION_AVAILABLE = True
26
- elif f"{major}.{minor}" == "8.9":
27
- from sageattention_sm89 import sageattn
28
- SAGE_ATTENTION_AVAILABLE = True
29
- elif f"{major}.{minor}" == "9.0":
30
- from sageattention_sm90 import sageattn
31
- SAGE_ATTENTION_AVAILABLE = True
32
- elif major>9:
33
- from sageattention_sm120 import sageattn
34
- SAGE_ATTENTION_AVAILABLE = True
35
- except:
36
- try:
37
- from sageattention import sageattn
38
- SAGE_ATTENTION_AVAILABLE = True
39
- except:
40
- sageattn = None
41
- SAGE_ATTENTION_AVAILABLE = False
42
-
43
- def flash_attention(
44
- q,
45
- k,
46
- v,
47
- q_lens=None,
48
- k_lens=None,
49
- dropout_p=0.,
50
- softmax_scale=None,
51
- q_scale=None,
52
- causal=False,
53
- window_size=(-1, -1),
54
- deterministic=False,
55
- dtype=torch.bfloat16,
56
- version=None,
57
- ):
58
- """
59
- q: [B, Lq, Nq, C1].
60
- k: [B, Lk, Nk, C1].
61
- v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
62
- q_lens: [B].
63
- k_lens: [B].
64
- dropout_p: float. Dropout probability.
65
- softmax_scale: float. The scaling of QK^T before applying softmax.
66
- causal: bool. Whether to apply causal attention mask.
67
- window_size: (left, right). If not (-1, -1), apply sliding window local attention.
68
- deterministic: bool. If True, slightly slower and uses more memory.
69
- dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
70
- """
71
- half_dtypes = (torch.float16, torch.bfloat16)
72
- assert dtype in half_dtypes
73
- assert q.device.type == 'cuda' and q.size(-1) <= 256
74
-
75
- # params
76
- b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
77
-
78
- def half(x):
79
- return x if x.dtype in half_dtypes else x.to(dtype)
80
-
81
- # preprocess query
82
- if q_lens is None:
83
- q = half(q.flatten(0, 1))
84
- q_lens = torch.tensor(
85
- [lq] * b, dtype=torch.int32).to(
86
- device=q.device, non_blocking=True)
87
- else:
88
- q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
89
-
90
- # preprocess key, value
91
- if k_lens is None:
92
- k = half(k.flatten(0, 1))
93
- v = half(v.flatten(0, 1))
94
- k_lens = torch.tensor(
95
- [lk] * b, dtype=torch.int32).to(
96
- device=k.device, non_blocking=True)
97
- else:
98
- k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
99
- v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
100
-
101
- q = q.to(v.dtype)
102
- k = k.to(v.dtype)
103
-
104
- if q_scale is not None:
105
- q = q * q_scale
106
-
107
- if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
108
- warnings.warn(
109
- 'Flash attention 3 is not available, using flash attention 2 instead.'
110
- )
111
-
112
- # apply attention
113
- if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
114
- # Note: dropout_p, window_size are not supported in FA3 now.
115
- x = flash_attn_interface.flash_attn_varlen_func(
116
- q=q,
117
- k=k,
118
- v=v,
119
- cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
120
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
121
- cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
122
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
123
- seqused_q=None,
124
- seqused_k=None,
125
- max_seqlen_q=lq,
126
- max_seqlen_k=lk,
127
- softmax_scale=softmax_scale,
128
- causal=causal,
129
- deterministic=deterministic)[0].unflatten(0, (b, lq))
130
- else:
131
- assert FLASH_ATTN_2_AVAILABLE
132
- x = flash_attn.flash_attn_varlen_func(
133
- q=q,
134
- k=k,
135
- v=v,
136
- cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
137
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
138
- cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
139
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
140
- max_seqlen_q=lq,
141
- max_seqlen_k=lk,
142
- dropout_p=dropout_p,
143
- softmax_scale=softmax_scale,
144
- causal=causal,
145
- window_size=window_size,
146
- deterministic=deterministic).unflatten(0, (b, lq))
147
-
148
- # output
149
- return x.type(out_dtype)
150
-
151
-
152
- def attention(
153
- q,
154
- k,
155
- v,
156
- q_lens=None,
157
- k_lens=None,
158
- dropout_p=0.,
159
- softmax_scale=None,
160
- q_scale=None,
161
- causal=False,
162
- window_size=(-1, -1),
163
- deterministic=False,
164
- dtype=torch.bfloat16,
165
- fa_version=None,
166
- attention_type=None,
167
- attn_mask=None,
168
- ):
169
- attention_type = os.environ.get("VIDEOX_ATTENTION_TYPE", "FLASH_ATTENTION") if attention_type is None else attention_type
170
- if torch.is_grad_enabled() and attention_type == "SAGE_ATTENTION":
171
- attention_type = "FLASH_ATTENTION"
172
-
173
- if attention_type == "SAGE_ATTENTION" and SAGE_ATTENTION_AVAILABLE:
174
- if q_lens is not None or k_lens is not None:
175
- warnings.warn(
176
- 'Padding mask is disabled when using SageAttention. It can have a significant impact on performance.'
177
- )
178
-
179
- out = sageattn(
180
- q, k, v, attn_mask=attn_mask, tensor_layout="NHD", is_causal=causal, dropout_p=dropout_p)
181
-
182
- elif attention_type == "FLASH_ATTENTION" and (FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE):
183
- return flash_attention(
184
- q=q,
185
- k=k,
186
- v=v,
187
- q_lens=q_lens,
188
- k_lens=k_lens,
189
- dropout_p=dropout_p,
190
- softmax_scale=softmax_scale,
191
- q_scale=q_scale,
192
- causal=causal,
193
- window_size=window_size,
194
- deterministic=deterministic,
195
- dtype=dtype,
196
- version=fa_version,
197
- )
198
- else:
199
- if q_lens is not None or k_lens is not None:
200
- warnings.warn(
201
- 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.'
202
- )
203
- q = q.transpose(1, 2)
204
- k = k.transpose(1, 2)
205
- v = v.transpose(1, 2)
206
-
207
- out = torch.nn.functional.scaled_dot_product_attention(
208
- q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p)
209
-
210
- out = out.transpose(1, 2).contiguous()
211
- return out
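A hedged usage sketch for the attention() dispatcher above: when neither flash-attn nor SageAttention is installed it falls through to torch.nn.functional.scaled_dot_product_attention, so the toy call below should run on a plain PyTorch install. The import path is the one this commit removes; shapes follow the [B, L, N, C] convention documented in flash_attention():

import torch
from videox_fun.models.attention_utils import attention  # path as it existed before this commit

q = torch.randn(1, 16, 8, 64)   # [B, Lq, Nq, C1]
k = torch.randn(1, 16, 8, 64)   # [B, Lk, Nk, C1]
v = torch.randn(1, 16, 8, 64)   # [B, Lk, Nk, C2]
out = attention(q, k, v, causal=False)
print(out.shape)                # torch.Size([1, 16, 8, 64])

The VIDEOX_ATTENTION_TYPE environment variable (FLASH_ATTENTION or SAGE_ATTENTION) selects the fast kernel when it is available, and the SageAttention path is automatically swapped back to flash attention whenever gradients are enabled.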
videox_fun/models/cache_utils.py DELETED
@@ -1,80 +0,0 @@
1
- import numpy as np
2
- import torch
3
-
4
- def get_teacache_coefficients(model_name):
5
- if "wan2.1-t2v-1.3b" in model_name.lower() or "wan2.1-fun-1.3b" in model_name.lower() \
6
- or "wan2.1-fun-v1.1-1.3b" in model_name.lower() or "wan2.1-vace-1.3b" in model_name.lower():
7
- return [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02]
8
- elif "wan2.1-t2v-14b" in model_name.lower():
9
- return [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01]
10
- elif "wan2.1-i2v-14b-480p" in model_name.lower():
11
- return [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01]
12
- elif "wan2.1-i2v-14b-720p" in model_name.lower() or "wan2.1-fun-14b" in model_name.lower() or "wan2.2-fun" in model_name.lower() \
13
- or "wan2.2-i2v-a14b" in model_name.lower() or "wan2.2-t2v-a14b" in model_name.lower() or "wan2.2-ti2v-5b" in model_name.lower() \
14
- or "wan2.2-s2v" in model_name.lower() or "wan2.1-vace-14b" in model_name.lower() or "wan2.2-vace-fun" in model_name.lower() \
15
- or "wan2.2-animate" in model_name.lower():
16
- return [8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02]
17
- elif "qwen-image" in model_name.lower():
18
- # Copied from https://github.com/chenpipi0807/ComfyUI-TeaCache/blob/main/nodes.py
19
- return [-4.50000000e+02, 2.80000000e+02, -4.50000000e+01, 3.20000000e+00, -2.00000000e-02]
20
- else:
21
- print(f"The model {model_name} is not supported by TeaCache.")
22
- return None
23
-
24
-
25
- class TeaCache():
26
- """
27
- Timestep Embedding Aware Cache, a training-free caching approach that estimates and leverages
28
- the fluctuating differences among model outputs across timesteps, thereby accelerating the inference.
29
- Please refer to:
30
- 1. https://github.com/ali-vilab/TeaCache.
31
- 2. Liu, Feng, et al. "Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model." arXiv preprint arXiv:2411.19108 (2024).
32
- """
33
- def __init__(
34
- self,
35
- coefficients: list[float],
36
- num_steps: int,
37
- rel_l1_thresh: float = 0.0,
38
- num_skip_start_steps: int = 0,
39
- offload: bool = True,
40
- ):
41
- if num_steps < 1:
42
- raise ValueError(f"`num_steps` must be greater than 0 but is {num_steps}.")
43
- if rel_l1_thresh < 0:
44
- raise ValueError(f"`rel_l1_thresh` must be greater than or equal to 0 but is {rel_l1_thresh}.")
45
- if num_skip_start_steps < 0 or num_skip_start_steps > num_steps:
46
- raise ValueError(
47
- "`num_skip_start_steps` must be great than or equal to 0 and "
48
- f"less than or equal to `num_steps={num_steps}` but is {num_skip_start_steps}."
49
- )
50
- self.coefficients = coefficients
51
- self.num_steps = num_steps
52
- self.rel_l1_thresh = rel_l1_thresh
53
- self.num_skip_start_steps = num_skip_start_steps
54
- self.offload = offload
55
- self.rescale_func = np.poly1d(self.coefficients)
56
-
57
- self.cnt = 0
58
- self.should_calc = True
59
- self.accumulated_rel_l1_distance = 0
60
- self.previous_modulated_input = None
61
- # Some pipelines concatenate the unconditional and text guide in forward.
62
- self.previous_residual = None
63
- # Some pipelines perform forward propagation separately on the unconditional and text guide.
64
- self.previous_residual_cond = None
65
- self.previous_residual_uncond = None
66
-
67
- @staticmethod
68
- def compute_rel_l1_distance(prev: torch.Tensor, cur: torch.Tensor) -> torch.Tensor:
69
- rel_l1_distance = (torch.abs(cur - prev).mean()) / torch.abs(prev).mean()
70
-
71
- return rel_l1_distance.cpu().item()
72
-
73
- def reset(self):
74
- self.cnt = 0
75
- self.should_calc = True
76
- self.accumulated_rel_l1_distance = 0
77
- self.previous_modulated_input = None
78
- self.previous_residual = None
79
- self.previous_residual_cond = None
80
- self.previous_residual_uncond = None
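The decision rule that consumes these fields lives in the pipelines rather than in this file, but it roughly works as sketched below (an assumed reconstruction with made-up coefficients and inputs): the relative L1 distance between consecutive modulated inputs is rescaled by the fitted polynomial and accumulated, and the transformer is only re-run once the accumulator crosses rel_l1_thresh; otherwise the cached residual is reused.

import numpy as np
import torch

rescale = np.poly1d([0.0, 0.0, 0.0, 1.0, 0.0])        # identity-like polynomial, illustration only
rel_l1_thresh, accumulated = 0.15, 0.0

prev = torch.randn(1, 64)
for step in range(4):
    cur = prev + 0.05 * torch.randn(1, 64)            # stand-in for the modulated timestep input
    rel_l1 = (torch.abs(cur - prev).mean() / torch.abs(prev).mean()).item()
    accumulated += float(rescale(rel_l1))
    should_calc = accumulated >= rel_l1_thresh
    if should_calc:
        accumulated = 0.0                             # full transformer pass, reset the accumulator
    prev = cur
    print(step, round(rel_l1, 4), should_calc)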
videox_fun/models/cogvideox_transformer3d.py DELETED
@@ -1,915 +0,0 @@
1
- # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
- # All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import glob
17
- import json
18
- import os
19
- from typing import Any, Dict, Optional, Tuple, Union
20
-
21
- import torch
22
- import torch.nn.functional as F
23
- from diffusers.configuration_utils import ConfigMixin, register_to_config
24
- from diffusers.models.attention import Attention, FeedForward
25
- from diffusers.models.attention_processor import (
26
- AttentionProcessor, FusedCogVideoXAttnProcessor2_0)
27
- from diffusers.models.embeddings import (CogVideoXPatchEmbed,
28
- TimestepEmbedding, Timesteps,
29
- get_3d_sincos_pos_embed)
30
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
31
- from diffusers.models.modeling_utils import ModelMixin
32
- from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero
33
- from diffusers.utils import is_torch_version, logging
34
- from diffusers.utils.torch_utils import maybe_allow_in_graph
35
- from torch import nn
36
-
37
- from ..dist import (get_sequence_parallel_rank,
38
- get_sequence_parallel_world_size, get_sp_group,
39
- xFuserLongContextAttention)
40
- from ..dist.cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0
41
- from .attention_utils import attention
42
-
43
-
44
- class CogVideoXAttnProcessor2_0:
45
- r"""
46
- Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
47
- query and key vectors, but does not include spatial normalization.
48
- """
49
-
50
- def __init__(self):
51
- if not hasattr(F, "scaled_dot_product_attention"):
52
- raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
53
-
54
- def __call__(
55
- self,
56
- attn,
57
- hidden_states: torch.Tensor,
58
- encoder_hidden_states: torch.Tensor,
59
- attention_mask: torch.Tensor = None,
60
- image_rotary_emb: torch.Tensor = None,
61
- ) -> torch.Tensor:
62
- text_seq_length = encoder_hidden_states.size(1)
63
-
64
- hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
65
-
66
- batch_size, sequence_length, _ = hidden_states.shape
67
-
68
- if attention_mask is not None:
69
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
70
- attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
71
-
72
- query = attn.to_q(hidden_states)
73
- key = attn.to_k(hidden_states)
74
- value = attn.to_v(hidden_states)
75
-
76
- inner_dim = key.shape[-1]
77
- head_dim = inner_dim // attn.heads
78
-
79
- query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
80
- key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
81
- value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
82
-
83
- if attn.norm_q is not None:
84
- query = attn.norm_q(query)
85
- if attn.norm_k is not None:
86
- key = attn.norm_k(key)
87
-
88
- # Apply RoPE if needed
89
- if image_rotary_emb is not None:
90
- from diffusers.models.embeddings import apply_rotary_emb
91
-
92
- query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
93
- if not attn.is_cross_attention:
94
- key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
95
-
96
- query = query.transpose(1, 2)
97
- key = key.transpose(1, 2)
98
- value = value.transpose(1, 2)
99
-
100
- hidden_states = attention(
101
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, causal=False
102
- )
103
- hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim)
104
-
105
- # linear proj
106
- hidden_states = attn.to_out[0](hidden_states)
107
- # dropout
108
- hidden_states = attn.to_out[1](hidden_states)
109
-
110
- encoder_hidden_states, hidden_states = hidden_states.split(
111
- [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
112
- )
113
- return hidden_states, encoder_hidden_states
114
-
115
-
116
- class CogVideoXPatchEmbed(nn.Module):
117
- def __init__(
118
- self,
119
- patch_size: int = 2,
120
- patch_size_t: Optional[int] = None,
121
- in_channels: int = 16,
122
- embed_dim: int = 1920,
123
- text_embed_dim: int = 4096,
124
- bias: bool = True,
125
- sample_width: int = 90,
126
- sample_height: int = 60,
127
- sample_frames: int = 49,
128
- temporal_compression_ratio: int = 4,
129
- max_text_seq_length: int = 226,
130
- spatial_interpolation_scale: float = 1.875,
131
- temporal_interpolation_scale: float = 1.0,
132
- use_positional_embeddings: bool = True,
133
- use_learned_positional_embeddings: bool = True,
134
- ) -> None:
135
- super().__init__()
136
-
137
- post_patch_height = sample_height // patch_size
138
- post_patch_width = sample_width // patch_size
139
- post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1
140
- self.num_patches = post_patch_height * post_patch_width * post_time_compression_frames
141
- self.post_patch_height = post_patch_height
142
- self.post_patch_width = post_patch_width
143
- self.post_time_compression_frames = post_time_compression_frames
144
- self.patch_size = patch_size
145
- self.patch_size_t = patch_size_t
146
- self.embed_dim = embed_dim
147
- self.sample_height = sample_height
148
- self.sample_width = sample_width
149
- self.sample_frames = sample_frames
150
- self.temporal_compression_ratio = temporal_compression_ratio
151
- self.max_text_seq_length = max_text_seq_length
152
- self.spatial_interpolation_scale = spatial_interpolation_scale
153
- self.temporal_interpolation_scale = temporal_interpolation_scale
154
- self.use_positional_embeddings = use_positional_embeddings
155
- self.use_learned_positional_embeddings = use_learned_positional_embeddings
156
-
157
- if patch_size_t is None:
158
- # CogVideoX 1.0 checkpoints
159
- self.proj = nn.Conv2d(
160
- in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
161
- )
162
- else:
163
- # CogVideoX 1.5 checkpoints
164
- self.proj = nn.Linear(in_channels * patch_size * patch_size * patch_size_t, embed_dim)
165
-
166
- self.text_proj = nn.Linear(text_embed_dim, embed_dim)
167
-
168
- if use_positional_embeddings or use_learned_positional_embeddings:
169
- persistent = use_learned_positional_embeddings
170
- pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
171
- self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
172
-
173
- def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
174
- post_patch_height = sample_height // self.patch_size
175
- post_patch_width = sample_width // self.patch_size
176
- post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
177
- num_patches = post_patch_height * post_patch_width * post_time_compression_frames
178
-
179
- pos_embedding = get_3d_sincos_pos_embed(
180
- self.embed_dim,
181
- (post_patch_width, post_patch_height),
182
- post_time_compression_frames,
183
- self.spatial_interpolation_scale,
184
- self.temporal_interpolation_scale,
185
- )
186
- pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
187
- joint_pos_embedding = torch.zeros(
188
- 1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
189
- )
190
- joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)
191
-
192
- return joint_pos_embedding
193
-
194
- def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
195
- r"""
196
- Args:
197
- text_embeds (`torch.Tensor`):
198
- Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
199
- image_embeds (`torch.Tensor`):
200
- Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
201
- """
202
- text_embeds = self.text_proj(text_embeds)
203
-
204
- text_batch_size, text_seq_length, text_channels = text_embeds.shape
205
- batch_size, num_frames, channels, height, width = image_embeds.shape
206
-
207
- if self.patch_size_t is None:
208
- image_embeds = image_embeds.reshape(-1, channels, height, width)
209
- image_embeds = self.proj(image_embeds)
210
- image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
211
- image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels]
212
- image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels]
213
- else:
214
- p = self.patch_size
215
- p_t = self.patch_size_t
216
-
217
- image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
218
- # b, f, h, w, c => b, f // 2, 2, h // 2, 2, w // 2, 2, c
219
- image_embeds = image_embeds.reshape(
220
- batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
221
- )
222
- # b, f // 2, 2, h // 2, 2, w // 2, 2, c => b, f // 2, h // 2, w // 2, c, 2, 2, 2
223
- image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
224
- image_embeds = self.proj(image_embeds)
225
-
226
- embeds = torch.cat(
227
- [text_embeds, image_embeds], dim=1
228
- ).contiguous() # [batch, seq_length + num_frames x height x width, channels]
229
-
230
- if self.use_positional_embeddings or self.use_learned_positional_embeddings:
231
- seq_length = height * width * num_frames // (self.patch_size**2)
232
- # pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length]
233
- pos_embeds = self.pos_embedding
234
- emb_size = embeds.size()[-1]
235
- pos_embeds_without_text = pos_embeds[:, text_seq_length: ].view(1, self.post_time_compression_frames, self.post_patch_height, self.post_patch_width, emb_size)
236
- pos_embeds_without_text = pos_embeds_without_text.permute([0, 4, 1, 2, 3])
237
- pos_embeds_without_text = F.interpolate(pos_embeds_without_text,size=[self.post_time_compression_frames, height // self.patch_size, width // self.patch_size], mode='trilinear', align_corners=False)
238
- pos_embeds_without_text = pos_embeds_without_text.permute([0, 2, 3, 4, 1]).view(1, -1, emb_size)
239
- pos_embeds = torch.cat([pos_embeds[:, :text_seq_length], pos_embeds_without_text], dim = 1)
240
- pos_embeds = pos_embeds[:, : text_seq_length + seq_length]
241
- embeds = embeds + pos_embeds
242
-
243
- return embeds
244
-
245
- @maybe_allow_in_graph
246
- class CogVideoXBlock(nn.Module):
247
- r"""
248
- Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
249
-
250
- Parameters:
251
- dim (`int`):
252
- The number of channels in the input and output.
253
- num_attention_heads (`int`):
254
- The number of heads to use for multi-head attention.
255
- attention_head_dim (`int`):
256
- The number of channels in each head.
257
- time_embed_dim (`int`):
258
- The number of channels in timestep embedding.
259
- dropout (`float`, defaults to `0.0`):
260
- The dropout probability to use.
261
- activation_fn (`str`, defaults to `"gelu-approximate"`):
262
- Activation function to be used in feed-forward.
263
- attention_bias (`bool`, defaults to `False`):
264
- Whether or not to use bias in attention projection layers.
265
- qk_norm (`bool`, defaults to `True`):
266
- Whether or not to use normalization after query and key projections in Attention.
267
- norm_elementwise_affine (`bool`, defaults to `True`):
268
- Whether to use learnable elementwise affine parameters for normalization.
269
- norm_eps (`float`, defaults to `1e-5`):
270
- Epsilon value for normalization layers.
271
- final_dropout (`bool`, defaults to `True`):
272
- Whether to apply a final dropout after the last feed-forward layer.
273
- ff_inner_dim (`int`, *optional*, defaults to `None`):
274
- Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
275
- ff_bias (`bool`, defaults to `True`):
276
- Whether or not to use bias in Feed-forward layer.
277
- attention_out_bias (`bool`, defaults to `True`):
278
- Whether or not to use bias in Attention output projection layer.
279
- """
280
-
281
- def __init__(
282
- self,
283
- dim: int,
284
- num_attention_heads: int,
285
- attention_head_dim: int,
286
- time_embed_dim: int,
287
- dropout: float = 0.0,
288
- activation_fn: str = "gelu-approximate",
289
- attention_bias: bool = False,
290
- qk_norm: bool = True,
291
- norm_elementwise_affine: bool = True,
292
- norm_eps: float = 1e-5,
293
- final_dropout: bool = True,
294
- ff_inner_dim: Optional[int] = None,
295
- ff_bias: bool = True,
296
- attention_out_bias: bool = True,
297
- ):
298
- super().__init__()
299
-
300
- # 1. Self Attention
301
- self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
302
-
303
- self.attn1 = Attention(
304
- query_dim=dim,
305
- dim_head=attention_head_dim,
306
- heads=num_attention_heads,
307
- qk_norm="layer_norm" if qk_norm else None,
308
- eps=1e-6,
309
- bias=attention_bias,
310
- out_bias=attention_out_bias,
311
- processor=CogVideoXAttnProcessor2_0(),
312
- )
313
-
314
- # 2. Feed Forward
315
- self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
316
-
317
- self.ff = FeedForward(
318
- dim,
319
- dropout=dropout,
320
- activation_fn=activation_fn,
321
- final_dropout=final_dropout,
322
- inner_dim=ff_inner_dim,
323
- bias=ff_bias,
324
- )
325
-
326
- def forward(
327
- self,
328
- hidden_states: torch.Tensor,
329
- encoder_hidden_states: torch.Tensor,
330
- temb: torch.Tensor,
331
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
332
- ) -> torch.Tensor:
333
- text_seq_length = encoder_hidden_states.size(1)
334
-
335
- # norm & modulate
336
- norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
337
- hidden_states, encoder_hidden_states, temb
338
- )
339
-
340
- # attention
341
- attn_hidden_states, attn_encoder_hidden_states = self.attn1(
342
- hidden_states=norm_hidden_states,
343
- encoder_hidden_states=norm_encoder_hidden_states,
344
- image_rotary_emb=image_rotary_emb,
345
- )
346
-
347
- hidden_states = hidden_states + gate_msa * attn_hidden_states
348
- encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
349
-
350
- # norm & modulate
351
- norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
352
- hidden_states, encoder_hidden_states, temb
353
- )
354
-
355
- # feed-forward
356
- norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
357
- ff_output = self.ff(norm_hidden_states)
358
-
359
- hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
360
- encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
361
-
362
- return hidden_states, encoder_hidden_states
363
-
364
-
365
- class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin):
366
- """
367
- A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
368
-
369
- Parameters:
370
- num_attention_heads (`int`, defaults to `30`):
371
- The number of heads to use for multi-head attention.
372
- attention_head_dim (`int`, defaults to `64`):
373
- The number of channels in each head.
374
- in_channels (`int`, defaults to `16`):
375
- The number of channels in the input.
376
- out_channels (`int`, *optional*, defaults to `16`):
377
- The number of channels in the output.
378
- flip_sin_to_cos (`bool`, defaults to `True`):
379
- Whether to flip the sin to cos in the time embedding.
380
- time_embed_dim (`int`, defaults to `512`):
381
- Output dimension of timestep embeddings.
382
- text_embed_dim (`int`, defaults to `4096`):
383
- Input dimension of text embeddings from the text encoder.
384
- num_layers (`int`, defaults to `30`):
385
- The number of layers of Transformer blocks to use.
386
- dropout (`float`, defaults to `0.0`):
387
- The dropout probability to use.
388
- attention_bias (`bool`, defaults to `True`):
389
- Whether or not to use bias in the attention projection layers.
390
- sample_width (`int`, defaults to `90`):
391
- The width of the input latents.
392
- sample_height (`int`, defaults to `60`):
393
- The height of the input latents.
394
- sample_frames (`int`, defaults to `49`):
395
- The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
396
- instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
397
- but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
398
- K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
399
- patch_size (`int`, defaults to `2`):
400
- The size of the patches to use in the patch embedding layer.
401
- temporal_compression_ratio (`int`, defaults to `4`):
402
- The compression ratio across the temporal dimension. See documentation for `sample_frames`.
403
- max_text_seq_length (`int`, defaults to `226`):
404
- The maximum sequence length of the input text embeddings.
405
- activation_fn (`str`, defaults to `"gelu-approximate"`):
406
- Activation function to use in feed-forward.
407
- timestep_activation_fn (`str`, defaults to `"silu"`):
408
- Activation function to use when generating the timestep embeddings.
409
- norm_elementwise_affine (`bool`, defaults to `True`):
410
- Whether or not to use elementwise affine in normalization layers.
411
- norm_eps (`float`, defaults to `1e-5`):
412
- The epsilon value to use in normalization layers.
413
- spatial_interpolation_scale (`float`, defaults to `1.875`):
414
- Scaling factor to apply in 3D positional embeddings across spatial dimensions.
415
- temporal_interpolation_scale (`float`, defaults to `1.0`):
416
- Scaling factor to apply in 3D positional embeddings across temporal dimensions.
417
- """
418
-
419
- _supports_gradient_checkpointing = True
420
-
421
- @register_to_config
422
- def __init__(
423
- self,
424
- num_attention_heads: int = 30,
425
- attention_head_dim: int = 64,
426
- in_channels: int = 16,
427
- out_channels: Optional[int] = 16,
428
- flip_sin_to_cos: bool = True,
429
- freq_shift: int = 0,
430
- time_embed_dim: int = 512,
431
- text_embed_dim: int = 4096,
432
- num_layers: int = 30,
433
- dropout: float = 0.0,
434
- attention_bias: bool = True,
435
- sample_width: int = 90,
436
- sample_height: int = 60,
437
- sample_frames: int = 49,
438
- patch_size: int = 2,
439
- patch_size_t: Optional[int] = None,
440
- temporal_compression_ratio: int = 4,
441
- max_text_seq_length: int = 226,
442
- activation_fn: str = "gelu-approximate",
443
- timestep_activation_fn: str = "silu",
444
- norm_elementwise_affine: bool = True,
445
- norm_eps: float = 1e-5,
446
- spatial_interpolation_scale: float = 1.875,
447
- temporal_interpolation_scale: float = 1.0,
448
- use_rotary_positional_embeddings: bool = False,
449
- use_learned_positional_embeddings: bool = False,
450
- patch_bias: bool = True,
451
- add_noise_in_inpaint_model: bool = False,
452
- ):
453
- super().__init__()
454
- inner_dim = num_attention_heads * attention_head_dim
455
- self.patch_size_t = patch_size_t
456
- if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
457
- raise ValueError(
458
- "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
459
- "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
460
- "issue at https://github.com/huggingface/diffusers/issues."
461
- )
462
-
463
- # 1. Patch embedding
464
- self.patch_embed = CogVideoXPatchEmbed(
465
- patch_size=patch_size,
466
- patch_size_t=patch_size_t,
467
- in_channels=in_channels,
468
- embed_dim=inner_dim,
469
- text_embed_dim=text_embed_dim,
470
- bias=patch_bias,
471
- sample_width=sample_width,
472
- sample_height=sample_height,
473
- sample_frames=sample_frames,
474
- temporal_compression_ratio=temporal_compression_ratio,
475
- max_text_seq_length=max_text_seq_length,
476
- spatial_interpolation_scale=spatial_interpolation_scale,
477
- temporal_interpolation_scale=temporal_interpolation_scale,
478
- use_positional_embeddings=not use_rotary_positional_embeddings,
479
- use_learned_positional_embeddings=use_learned_positional_embeddings,
480
- )
481
- self.embedding_dropout = nn.Dropout(dropout)
482
-
483
- # 2. Time embeddings
484
- self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
485
- self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
486
-
487
- # 3. Define spatio-temporal transformers blocks
488
- self.transformer_blocks = nn.ModuleList(
489
- [
490
- CogVideoXBlock(
491
- dim=inner_dim,
492
- num_attention_heads=num_attention_heads,
493
- attention_head_dim=attention_head_dim,
494
- time_embed_dim=time_embed_dim,
495
- dropout=dropout,
496
- activation_fn=activation_fn,
497
- attention_bias=attention_bias,
498
- norm_elementwise_affine=norm_elementwise_affine,
499
- norm_eps=norm_eps,
500
- )
501
- for _ in range(num_layers)
502
- ]
503
- )
504
- self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)
505
-
506
- # 4. Output blocks
507
- self.norm_out = AdaLayerNorm(
508
- embedding_dim=time_embed_dim,
509
- output_dim=2 * inner_dim,
510
- norm_elementwise_affine=norm_elementwise_affine,
511
- norm_eps=norm_eps,
512
- chunk_dim=1,
513
- )
514
-
515
- if patch_size_t is None:
516
- # For CogVideox 1.0
517
- output_dim = patch_size * patch_size * out_channels
518
- else:
519
- # For CogVideoX 1.5
520
- output_dim = patch_size * patch_size * patch_size_t * out_channels
521
-
522
- self.proj_out = nn.Linear(inner_dim, output_dim)
523
-
524
- self.gradient_checkpointing = False
525
- self.sp_world_size = 1
526
- self.sp_world_rank = 0
527
-
528
- def _set_gradient_checkpointing(self, *args, **kwargs):
529
- if "value" in kwargs:
530
- self.gradient_checkpointing = kwargs["value"]
531
- elif "enable" in kwargs:
532
- self.gradient_checkpointing = kwargs["enable"]
533
- else:
534
- raise ValueError("Invalid set gradient checkpointing")
535
-
536
- def enable_multi_gpus_inference(self,):
537
- self.sp_world_size = get_sequence_parallel_world_size()
538
- self.sp_world_rank = get_sequence_parallel_rank()
539
- self.set_attn_processor(CogVideoXMultiGPUsAttnProcessor2_0())
540
-
541
- @property
542
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
543
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
544
- r"""
545
- Returns:
546
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
547
- indexed by its weight name.
548
- """
549
- # set recursively
550
- processors = {}
551
-
552
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
553
- if hasattr(module, "get_processor"):
554
- processors[f"{name}.processor"] = module.get_processor()
555
-
556
- for sub_name, child in module.named_children():
557
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
558
-
559
- return processors
560
-
561
- for name, module in self.named_children():
562
- fn_recursive_add_processors(name, module, processors)
563
-
564
- return processors
565
-
566
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
567
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
568
- r"""
569
- Sets the attention processor to use to compute attention.
570
-
571
- Parameters:
572
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
573
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
574
- for **all** `Attention` layers.
575
-
576
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
577
- processor. This is strongly recommended when setting trainable attention processors.
578
-
579
- """
580
- count = len(self.attn_processors.keys())
581
-
582
- if isinstance(processor, dict) and len(processor) != count:
583
- raise ValueError(
584
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
585
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
586
- )
587
-
588
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
589
- if hasattr(module, "set_processor"):
590
- if not isinstance(processor, dict):
591
- module.set_processor(processor)
592
- else:
593
- module.set_processor(processor.pop(f"{name}.processor"))
594
-
595
- for sub_name, child in module.named_children():
596
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
597
-
598
- for name, module in self.named_children():
599
- fn_recursive_attn_processor(name, module, processor)
600
-
601
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0
602
- def fuse_qkv_projections(self):
603
- """
604
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
605
- are fused. For cross-attention modules, key and value projection matrices are fused.
606
-
607
- <Tip warning={true}>
608
-
609
- This API is 🧪 experimental.
610
-
611
- </Tip>
612
- """
613
- self.original_attn_processors = None
614
-
615
- for _, attn_processor in self.attn_processors.items():
616
- if "Added" in str(attn_processor.__class__.__name__):
617
- raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
618
-
619
- self.original_attn_processors = self.attn_processors
620
-
621
- for module in self.modules():
622
- if isinstance(module, Attention):
623
- module.fuse_projections(fuse=True)
624
-
625
- self.set_attn_processor(FusedCogVideoXAttnProcessor2_0())
626
-
627
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
628
- def unfuse_qkv_projections(self):
629
- """Disables the fused QKV projection if enabled.
630
-
631
- <Tip warning={true}>
632
-
633
- This API is 🧪 experimental.
634
-
635
- </Tip>
636
-
637
- """
638
- if self.original_attn_processors is not None:
639
- self.set_attn_processor(self.original_attn_processors)
640
-
641
- def forward(
642
- self,
643
- hidden_states: torch.Tensor,
644
- encoder_hidden_states: torch.Tensor,
645
- timestep: Union[int, float, torch.LongTensor],
646
- timestep_cond: Optional[torch.Tensor] = None,
647
- inpaint_latents: Optional[torch.Tensor] = None,
648
- control_latents: Optional[torch.Tensor] = None,
649
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
650
- return_dict: bool = True,
651
- ):
652
- batch_size, num_frames, channels, height, width = hidden_states.shape
653
- if num_frames == 1 and self.patch_size_t is not None:
654
- hidden_states = torch.cat([hidden_states, torch.zeros_like(hidden_states)], dim=1)
655
- if inpaint_latents is not None:
656
- inpaint_latents = torch.concat([inpaint_latents, torch.zeros_like(inpaint_latents)], dim=1)
657
- if control_latents is not None:
658
- control_latents = torch.concat([control_latents, torch.zeros_like(control_latents)], dim=1)
659
- local_num_frames = num_frames + 1
660
- else:
661
- local_num_frames = num_frames
662
-
663
- # 1. Time embedding
664
- timesteps = timestep
665
- t_emb = self.time_proj(timesteps)
666
-
667
- # timesteps does not contain any weights and will always return f32 tensors
668
- # but time_embedding might actually be running in fp16. so we need to cast here.
669
- # there might be better ways to encapsulate this.
670
- t_emb = t_emb.to(dtype=hidden_states.dtype)
671
- emb = self.time_embedding(t_emb, timestep_cond)
672
-
673
- # 2. Patch embedding
674
- if inpaint_latents is not None:
675
- hidden_states = torch.concat([hidden_states, inpaint_latents], 2)
676
- if control_latents is not None:
677
- hidden_states = torch.concat([hidden_states, control_latents], 2)
678
- hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
679
- hidden_states = self.embedding_dropout(hidden_states)
680
-
681
- text_seq_length = encoder_hidden_states.shape[1]
682
- encoder_hidden_states = hidden_states[:, :text_seq_length]
683
- hidden_states = hidden_states[:, text_seq_length:]
684
-
685
- # Context Parallel
686
- if self.sp_world_size > 1:
687
- hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
688
- if image_rotary_emb is not None:
689
- image_rotary_emb = (
690
- torch.chunk(image_rotary_emb[0], self.sp_world_size, dim=0)[self.sp_world_rank],
691
- torch.chunk(image_rotary_emb[1], self.sp_world_size, dim=0)[self.sp_world_rank]
692
- )
693
-
694
- # 3. Transformer blocks
695
- for i, block in enumerate(self.transformer_blocks):
696
- if torch.is_grad_enabled() and self.gradient_checkpointing:
697
-
698
- def create_custom_forward(module):
699
- def custom_forward(*inputs):
700
- return module(*inputs)
701
-
702
- return custom_forward
703
-
704
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
705
- hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
706
- create_custom_forward(block),
707
- hidden_states,
708
- encoder_hidden_states,
709
- emb,
710
- image_rotary_emb,
711
- **ckpt_kwargs,
712
- )
713
- else:
714
- hidden_states, encoder_hidden_states = block(
715
- hidden_states=hidden_states,
716
- encoder_hidden_states=encoder_hidden_states,
717
- temb=emb,
718
- image_rotary_emb=image_rotary_emb,
719
- )
720
-
721
- if not self.config.use_rotary_positional_embeddings:
722
- # CogVideoX-2B
723
- hidden_states = self.norm_final(hidden_states)
724
- else:
725
- # CogVideoX-5B
726
- hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
727
- hidden_states = self.norm_final(hidden_states)
728
- hidden_states = hidden_states[:, text_seq_length:]
729
-
730
- # 4. Final block
731
- hidden_states = self.norm_out(hidden_states, temb=emb)
732
- hidden_states = self.proj_out(hidden_states)
733
-
734
- if self.sp_world_size > 1:
735
- hidden_states = get_sp_group().all_gather(hidden_states, dim=1)
736
-
737
- # 5. Unpatchify
738
- p = self.config.patch_size
739
- p_t = self.config.patch_size_t
740
-
741
- if p_t is None:
742
- output = hidden_states.reshape(batch_size, local_num_frames, height // p, width // p, -1, p, p)
743
- output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
744
- else:
745
- output = hidden_states.reshape(
746
- batch_size, (local_num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
747
- )
748
- output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
749
-
750
- if num_frames == 1:
751
- output = output[:, :num_frames, :]
752
-
753
- if not return_dict:
754
- return (output,)
755
- return Transformer2DModelOutput(sample=output)
756
-
757
- @classmethod
758
- def from_pretrained(
759
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
760
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
761
- ):
762
- if subfolder is not None:
763
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
764
- print(f"loaded 3D transformer's pretrained weights from {pretrained_model_path} ...")
765
-
766
- config_file = os.path.join(pretrained_model_path, 'config.json')
767
- if not os.path.isfile(config_file):
768
- raise RuntimeError(f"{config_file} does not exist")
769
- with open(config_file, "r") as f:
770
- config = json.load(f)
771
-
772
- from diffusers.utils import WEIGHTS_NAME
773
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
774
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
775
-
776
- if "dict_mapping" in transformer_additional_kwargs.keys():
777
- for key in transformer_additional_kwargs["dict_mapping"]:
778
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
779
-
780
- if low_cpu_mem_usage:
781
- try:
782
- import re
783
-
784
- from diffusers import __version__ as diffusers_version
785
- if diffusers_version >= "0.33.0":
786
- from diffusers.models.model_loading_utils import \
787
- load_model_dict_into_meta
788
- else:
789
- from diffusers.models.modeling_utils import \
790
- load_model_dict_into_meta
791
- from diffusers.utils import is_accelerate_available
792
- if is_accelerate_available():
793
- import accelerate
794
-
795
- # Instantiate model with empty weights
796
- with accelerate.init_empty_weights():
797
- model = cls.from_config(config, **transformer_additional_kwargs)
798
-
799
- param_device = "cpu"
800
- if os.path.exists(model_file):
801
- state_dict = torch.load(model_file, map_location="cpu")
802
- elif os.path.exists(model_file_safetensors):
803
- from safetensors.torch import load_file, safe_open
804
- state_dict = load_file(model_file_safetensors)
805
- else:
806
- from safetensors.torch import load_file, safe_open
807
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
808
- state_dict = {}
809
- for _model_file_safetensors in model_files_safetensors:
810
- _state_dict = load_file(_model_file_safetensors)
811
- for key in _state_dict:
812
- state_dict[key] = _state_dict[key]
813
- model._convert_deprecated_attention_blocks(state_dict)
814
-
815
- if diffusers_version >= "0.33.0":
816
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
817
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
818
- load_model_dict_into_meta(
819
- model,
820
- state_dict,
821
- dtype=torch_dtype,
822
- model_name_or_path=pretrained_model_path,
823
- )
824
- else:
825
- # move the params from meta device to cpu
826
- missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
827
- if len(missing_keys) > 0:
828
- raise ValueError(
829
- f"Cannot load {cls} from {pretrained_model_path} because the following keys are"
830
- f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
831
- " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
832
- " those weights or else make sure your checkpoint file is correct."
833
- )
834
-
835
- unexpected_keys = load_model_dict_into_meta(
836
- model,
837
- state_dict,
838
- device=param_device,
839
- dtype=torch_dtype,
840
- model_name_or_path=pretrained_model_path,
841
- )
842
-
843
- if cls._keys_to_ignore_on_load_unexpected is not None:
844
- for pat in cls._keys_to_ignore_on_load_unexpected:
845
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
846
-
847
- if len(unexpected_keys) > 0:
848
- print(
849
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
850
- )
851
-
852
- return model
853
- except Exception as e:
854
- print(
855
- f"The low_cpu_mem_usage mode is not work because {e}. Use low_cpu_mem_usage=False instead."
856
- )
857
-
858
- model = cls.from_config(config, **transformer_additional_kwargs)
859
- if os.path.exists(model_file):
860
- state_dict = torch.load(model_file, map_location="cpu")
861
- elif os.path.exists(model_file_safetensors):
862
- from safetensors.torch import load_file, safe_open
863
- state_dict = load_file(model_file_safetensors)
864
- else:
865
- from safetensors.torch import load_file, safe_open
866
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
867
- state_dict = {}
868
- for _model_file_safetensors in model_files_safetensors:
869
- _state_dict = load_file(_model_file_safetensors)
870
- for key in _state_dict:
871
- state_dict[key] = _state_dict[key]
872
-
873
- if model.state_dict()['patch_embed.proj.weight'].size() != state_dict['patch_embed.proj.weight'].size():
874
- new_shape = model.state_dict()['patch_embed.proj.weight'].size()
875
- if len(new_shape) == 5:
876
- state_dict['patch_embed.proj.weight'] = state_dict['patch_embed.proj.weight'].unsqueeze(2).expand(new_shape).clone()
877
- state_dict['patch_embed.proj.weight'][:, :, :-1] = 0
878
- elif len(new_shape) == 2:
879
- if model.state_dict()['patch_embed.proj.weight'].size()[1] > state_dict['patch_embed.proj.weight'].size()[1]:
880
- model.state_dict()['patch_embed.proj.weight'][:, :state_dict['patch_embed.proj.weight'].size()[1]] = state_dict['patch_embed.proj.weight']
881
- model.state_dict()['patch_embed.proj.weight'][:, state_dict['patch_embed.proj.weight'].size()[1]:] = 0
882
- state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight']
883
- else:
884
- model.state_dict()['patch_embed.proj.weight'][:, :] = state_dict['patch_embed.proj.weight'][:, :model.state_dict()['patch_embed.proj.weight'].size()[1]]
885
- state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight']
886
- else:
887
- if model.state_dict()['patch_embed.proj.weight'].size()[1] > state_dict['patch_embed.proj.weight'].size()[1]:
888
- model.state_dict()['patch_embed.proj.weight'][:, :state_dict['patch_embed.proj.weight'].size()[1], :, :] = state_dict['patch_embed.proj.weight']
889
- model.state_dict()['patch_embed.proj.weight'][:, state_dict['patch_embed.proj.weight'].size()[1]:, :, :] = 0
890
- state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight']
891
- else:
892
- model.state_dict()['patch_embed.proj.weight'][:, :, :, :] = state_dict['patch_embed.proj.weight'][:, :model.state_dict()['patch_embed.proj.weight'].size()[1], :, :]
893
- state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight']
894
-
895
- tmp_state_dict = {}
896
- for key in state_dict:
897
- if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
898
- tmp_state_dict[key] = state_dict[key]
899
- else:
900
- print(key, "Size don't match, skip")
901
-
902
- state_dict = tmp_state_dict
903
-
904
- m, u = model.load_state_dict(state_dict, strict=False)
905
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
906
- print(m)
907
-
908
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
909
- print(f"### All Parameters: {sum(params) / 1e6} M")
910
-
911
- params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
912
- print(f"### attn1 Parameters: {sum(params) / 1e6} M")
913
-
914
- model = model.to(torch_dtype)
915
- return model
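For orientation, a minimal usage sketch of the loader above. The class name CogVideoXTransformer3DModel and the checkpoint path are assumptions for illustration only; the signature itself is the one defined in this file.

import torch
# Assumed class name; only the module path appears in this commit.
from videox_fun.models.cogvideox_transformer3d import CogVideoXTransformer3DModel

transformer = CogVideoXTransformer3DModel.from_pretrained(
    "path/to/CogVideoX-checkpoint",   # directory containing config.json and *.safetensors / *.bin
    subfolder="transformer",
    low_cpu_mem_usage=True,           # meta-device init + load_model_dict_into_meta when accelerate is available
    torch_dtype=torch.bfloat16,
)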
videox_fun/models/cogvideox_vae.py DELETED
@@ -1,1675 +0,0 @@
1
- # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
- # All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- from typing import Dict, Optional, Tuple, Union
17
-
18
- import numpy as np
19
- import torch
20
- import torch.nn as nn
21
- import torch.nn.functional as F
22
- import json
23
- import os
24
-
25
- from diffusers.configuration_utils import ConfigMixin, register_to_config
26
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
27
- from diffusers.utils import logging
28
- from diffusers.utils.accelerate_utils import apply_forward_hook
29
- from diffusers.models.activations import get_activation
30
- from diffusers.models.downsampling import CogVideoXDownsample3D
31
- from diffusers.models.modeling_outputs import AutoencoderKLOutput
32
- from diffusers.models.modeling_utils import ModelMixin
33
- from diffusers.models.upsampling import CogVideoXUpsample3D
34
- from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
35
-
36
-
37
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
-
39
-
40
- class CogVideoXSafeConv3d(nn.Conv3d):
41
- r"""
42
- A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model.
43
- """
44
-
45
- def forward(self, input: torch.Tensor) -> torch.Tensor:
46
- memory_count = (
47
- (input.shape[0] * input.shape[1] * input.shape[2] * input.shape[3] * input.shape[4]) * 2 / 1024**3
48
- )
49
-
50
- # Set to 2GB, suitable for CuDNN
51
- if memory_count > 2:
52
- kernel_size = self.kernel_size[0]
53
- part_num = int(memory_count / 2) + 1
54
- input_chunks = torch.chunk(input, part_num, dim=2)
55
-
56
- if kernel_size > 1:
57
- input_chunks = [input_chunks[0]] + [
58
- torch.cat((input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), dim=2)
59
- for i in range(1, len(input_chunks))
60
- ]
61
-
62
- output_chunks = []
63
- for input_chunk in input_chunks:
64
- output_chunks.append(super().forward(input_chunk))
65
- output = torch.cat(output_chunks, dim=2)
66
- return output
67
- else:
68
- return super().forward(input)
69
-
70
-
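As a rough illustration of the 2 GB threshold above, a minimal sketch of how the rule turns into a chunk count; the activation shape is assumed for illustration, not taken from this file.

# Back-of-the-envelope check of the chunking rule in CogVideoXSafeConv3d.
shape = (1, 128, 25, 480, 720)            # assumed (B, C, T, H, W) activation
elements = 1
for s in shape:
    elements *= s
memory_gb = elements * 2 / 1024**3        # ~2.06 GB at 2 bytes per element
part_num = int(memory_gb / 2) + 1         # -> 2 chunks along the time axis (dim=2)
print(f"{memory_gb:.2f} GB -> {part_num} chunks")
# With kernel_size > 1, each chunk after the first is prepended with the last
# kernel_size - 1 frames of the previous chunk, so the temporal conv stays seamless.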
71
- class CogVideoXCausalConv3d(nn.Module):
72
- r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model.
73
-
74
- Args:
75
- in_channels (`int`): Number of channels in the input tensor.
76
- out_channels (`int`): Number of output channels produced by the convolution.
77
- kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel.
78
- stride (`int`, defaults to `1`): Stride of the convolution.
79
- dilation (`int`, defaults to `1`): Dilation rate of the convolution.
80
- pad_mode (`str`, defaults to `"constant"`): Padding mode.
81
- """
82
-
83
- def __init__(
84
- self,
85
- in_channels: int,
86
- out_channels: int,
87
- kernel_size: Union[int, Tuple[int, int, int]],
88
- stride: int = 1,
89
- dilation: int = 1,
90
- pad_mode: str = "constant",
91
- ):
92
- super().__init__()
93
-
94
- if isinstance(kernel_size, int):
95
- kernel_size = (kernel_size,) * 3
96
-
97
- time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
98
-
99
- # TODO(aryan): configure calculation based on stride and dilation in the future.
100
- # Since CogVideoX does not use it, it is currently tailored to "just work" with Mochi
101
- time_pad = time_kernel_size - 1
102
- height_pad = (height_kernel_size - 1) // 2
103
- width_pad = (width_kernel_size - 1) // 2
104
-
105
- self.pad_mode = pad_mode
106
- self.height_pad = height_pad
107
- self.width_pad = width_pad
108
- self.time_pad = time_pad
109
- self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
110
-
111
- self.temporal_dim = 2
112
- self.time_kernel_size = time_kernel_size
113
-
114
- stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
115
- dilation = (dilation, 1, 1)
116
- self.conv = CogVideoXSafeConv3d(
117
- in_channels=in_channels,
118
- out_channels=out_channels,
119
- kernel_size=kernel_size,
120
- stride=stride,
121
- dilation=dilation,
122
- )
123
-
124
- def fake_context_parallel_forward(
125
- self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None
126
- ) -> torch.Tensor:
127
- if self.pad_mode == "replicate":
128
- inputs = F.pad(inputs, self.time_causal_padding, mode="replicate")
129
- else:
130
- kernel_size = self.time_kernel_size
131
- if kernel_size > 1:
132
- cached_inputs = [conv_cache] if conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1)
133
- inputs = torch.cat(cached_inputs + [inputs], dim=2)
134
- return inputs
135
-
136
- def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None) -> torch.Tensor:
137
- inputs = self.fake_context_parallel_forward(inputs, conv_cache)
138
-
139
- if self.pad_mode == "replicate":
140
- conv_cache = None
141
- else:
142
- padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
143
- conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
144
- inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
145
-
146
- output = self.conv(inputs)
147
- return output, conv_cache
148
-
149
-
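The padding above is causal: only the past side of the time axis is ever padded. A standalone sketch of that idea with illustrative shapes, using a plain nn.Conv3d in place of CogVideoXSafeConv3d:

import torch

k_t = 3                                    # temporal kernel size, as in kernel_size=3
x = torch.randn(1, 16, 8, 32, 32)          # assumed (B, C, T, H, W)
front = x[:, :, :1].repeat(1, 1, k_t - 1, 1, 1)   # repeat the first frame k_t - 1 times
x_causal = torch.cat([front, x], dim=2)    # pad the past only, never the future
conv = torch.nn.Conv3d(16, 16, kernel_size=3, padding=(0, 1, 1))
print(conv(x_causal).shape)                # torch.Size([1, 16, 8, 32, 32]) -> T preserved
# Across chunked calls, the module above caches the last k_t - 1 input frames
# (conv_cache) and prepends those instead of repeating the first frame.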
150
- class CogVideoXSpatialNorm3D(nn.Module):
151
- r"""
152
- Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. This implementation is specific
153
- to 3D-video like data.
154
-
155
- CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model.
156
-
157
- Args:
158
- f_channels (`int`):
159
- The number of channels for input to group normalization layer, and output of the spatial norm layer.
160
- zq_channels (`int`):
161
- The number of channels for the quantized vector as described in the paper.
162
- groups (`int`):
163
- Number of groups to separate the channels into for group normalization.
164
- """
165
-
166
- def __init__(
167
- self,
168
- f_channels: int,
169
- zq_channels: int,
170
- groups: int = 32,
171
- ):
172
- super().__init__()
173
- self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
174
- self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
175
- self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
176
-
177
- def forward(
178
- self, f: torch.Tensor, zq: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None
179
- ) -> torch.Tensor:
180
- new_conv_cache = {}
181
- conv_cache = conv_cache or {}
182
-
183
- if f.shape[2] > 1 and f.shape[2] % 2 == 1:
184
- f_first, f_rest = f[:, :, :1], f[:, :, 1:]
185
- f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:]
186
- z_first, z_rest = zq[:, :, :1], zq[:, :, 1:]
187
- z_first = F.interpolate(z_first, size=f_first_size)
188
- z_rest = F.interpolate(z_rest, size=f_rest_size)
189
- zq = torch.cat([z_first, z_rest], dim=2)
190
- else:
191
- zq = F.interpolate(zq, size=f.shape[-3:])
192
-
193
- conv_y, new_conv_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
194
- conv_b, new_conv_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
195
-
196
- norm_f = self.norm_layer(f)
197
- new_f = norm_f * conv_y + conv_b
198
- return new_f, new_conv_cache
199
-
200
-
201
- class CogVideoXUpsample3D(nn.Module):
202
- r"""
203
- A 3D Upsample layer used in CogVideoX by Tsinghua University & ZhipuAI # Todo: Wait for paper release.
204
-
205
- Args:
206
- in_channels (`int`):
207
- Number of channels in the input image.
208
- out_channels (`int`):
209
- Number of channels produced by the convolution.
210
- kernel_size (`int`, defaults to `3`):
211
- Size of the convolving kernel.
212
- stride (`int`, defaults to `1`):
213
- Stride of the convolution.
214
- padding (`int`, defaults to `1`):
215
- Padding added to all four sides of the input.
216
- compress_time (`bool`, defaults to `False`):
217
- Whether or not to compress the time dimension.
218
- """
219
-
220
- def __init__(
221
- self,
222
- in_channels: int,
223
- out_channels: int,
224
- kernel_size: int = 3,
225
- stride: int = 1,
226
- padding: int = 1,
227
- compress_time: bool = False,
228
- ) -> None:
229
- super().__init__()
230
-
231
- self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
232
- self.compress_time = compress_time
233
-
234
- self.auto_split_process = True
235
- self.first_frame_flag = False
236
-
237
- def forward(self, inputs: torch.Tensor) -> torch.Tensor:
238
- if self.compress_time:
239
- if self.auto_split_process:
240
- if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1:
241
- # split first frame
242
- x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:]
243
-
244
- x_first = F.interpolate(x_first, scale_factor=2.0)
245
- x_rest = F.interpolate(x_rest, scale_factor=2.0)
246
- x_first = x_first[:, :, None, :, :]
247
- inputs = torch.cat([x_first, x_rest], dim=2)
248
- elif inputs.shape[2] > 1:
249
- inputs = F.interpolate(inputs, scale_factor=2.0)
250
- else:
251
- inputs = inputs.squeeze(2)
252
- inputs = F.interpolate(inputs, scale_factor=2.0)
253
- inputs = inputs[:, :, None, :, :]
254
- else:
255
- if self.first_frame_flag:
256
- inputs = inputs.squeeze(2)
257
- inputs = F.interpolate(inputs, scale_factor=2.0)
258
- inputs = inputs[:, :, None, :, :]
259
- else:
260
- inputs = F.interpolate(inputs, scale_factor=2.0)
261
- else:
262
- # only interpolate 2D
263
- b, c, t, h, w = inputs.shape
264
- inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
265
- inputs = F.interpolate(inputs, scale_factor=2.0)
266
- inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4)
267
-
268
- b, c, t, h, w = inputs.shape
269
- inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
270
- inputs = self.conv(inputs)
271
- inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4)
272
-
273
- return inputs
274
-
275
-
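A short shape walk-through of the compress_time branch above for an odd frame count (illustrative sizes, nearest-neighbour interpolation as in the F.interpolate defaults):

import torch
import torch.nn.functional as F

x = torch.randn(1, 64, 3, 60, 90)                   # assumed (B, C, T=3, H, W)
x_first, x_rest = x[:, :, 0], x[:, :, 1:]            # frame 0 vs. frames 1..2
x_first = F.interpolate(x_first, scale_factor=2.0)   # (1, 64, 120, 180): spatial only
x_rest = F.interpolate(x_rest, scale_factor=2.0)     # (1, 64, 4, 120, 180): time and space
out = torch.cat([x_first[:, :, None], x_rest], dim=2)
print(out.shape)                                     # torch.Size([1, 64, 5, 120, 180])
# T goes 3 -> 1 + 2 * (3 - 1) = 5: the first frame is never duplicated in time,
# which preserves the causal "first frame" semantics of the VAE.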
276
- class CogVideoXResnetBlock3D(nn.Module):
277
- r"""
278
- A 3D ResNet block used in the CogVideoX model.
279
-
280
- Args:
281
- in_channels (`int`):
282
- Number of input channels.
283
- out_channels (`int`, *optional*):
284
- Number of output channels. If None, defaults to `in_channels`.
285
- dropout (`float`, defaults to `0.0`):
286
- Dropout rate.
287
- temb_channels (`int`, defaults to `512`):
288
- Number of time embedding channels.
289
- groups (`int`, defaults to `32`):
290
- Number of groups to separate the channels into for group normalization.
291
- eps (`float`, defaults to `1e-6`):
292
- Epsilon value for normalization layers.
293
- non_linearity (`str`, defaults to `"swish"`):
294
- Activation function to use.
295
- conv_shortcut (bool, defaults to `False`):
296
- Whether or not to use a convolution shortcut.
297
- spatial_norm_dim (`int`, *optional*):
298
- The dimension to use for spatial norm if it is to be used instead of group norm.
299
- pad_mode (str, defaults to `"first"`):
300
- Padding mode.
301
- """
302
-
303
- def __init__(
304
- self,
305
- in_channels: int,
306
- out_channels: Optional[int] = None,
307
- dropout: float = 0.0,
308
- temb_channels: int = 512,
309
- groups: int = 32,
310
- eps: float = 1e-6,
311
- non_linearity: str = "swish",
312
- conv_shortcut: bool = False,
313
- spatial_norm_dim: Optional[int] = None,
314
- pad_mode: str = "first",
315
- ):
316
- super().__init__()
317
-
318
- out_channels = out_channels or in_channels
319
-
320
- self.in_channels = in_channels
321
- self.out_channels = out_channels
322
- self.nonlinearity = get_activation(non_linearity)
323
- self.use_conv_shortcut = conv_shortcut
324
- self.spatial_norm_dim = spatial_norm_dim
325
-
326
- if spatial_norm_dim is None:
327
- self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
328
- self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
329
- else:
330
- self.norm1 = CogVideoXSpatialNorm3D(
331
- f_channels=in_channels,
332
- zq_channels=spatial_norm_dim,
333
- groups=groups,
334
- )
335
- self.norm2 = CogVideoXSpatialNorm3D(
336
- f_channels=out_channels,
337
- zq_channels=spatial_norm_dim,
338
- groups=groups,
339
- )
340
-
341
- self.conv1 = CogVideoXCausalConv3d(
342
- in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
343
- )
344
-
345
- if temb_channels > 0:
346
- self.temb_proj = nn.Linear(in_features=temb_channels, out_features=out_channels)
347
-
348
- self.dropout = nn.Dropout(dropout)
349
- self.conv2 = CogVideoXCausalConv3d(
350
- in_channels=out_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
351
- )
352
-
353
- if self.in_channels != self.out_channels:
354
- if self.use_conv_shortcut:
355
- self.conv_shortcut = CogVideoXCausalConv3d(
356
- in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode
357
- )
358
- else:
359
- self.conv_shortcut = CogVideoXSafeConv3d(
360
- in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0
361
- )
362
-
363
- def forward(
364
- self,
365
- inputs: torch.Tensor,
366
- temb: Optional[torch.Tensor] = None,
367
- zq: Optional[torch.Tensor] = None,
368
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
369
- ) -> torch.Tensor:
370
- new_conv_cache = {}
371
- conv_cache = conv_cache or {}
372
-
373
- hidden_states = inputs
374
-
375
- if zq is not None:
376
- hidden_states, new_conv_cache["norm1"] = self.norm1(hidden_states, zq, conv_cache=conv_cache.get("norm1"))
377
- else:
378
- hidden_states = self.norm1(hidden_states)
379
-
380
- hidden_states = self.nonlinearity(hidden_states)
381
- hidden_states, new_conv_cache["conv1"] = self.conv1(hidden_states, conv_cache=conv_cache.get("conv1"))
382
-
383
- if temb is not None:
384
- hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
385
-
386
- if zq is not None:
387
- hidden_states, new_conv_cache["norm2"] = self.norm2(hidden_states, zq, conv_cache=conv_cache.get("norm2"))
388
- else:
389
- hidden_states = self.norm2(hidden_states)
390
-
391
- hidden_states = self.nonlinearity(hidden_states)
392
- hidden_states = self.dropout(hidden_states)
393
- hidden_states, new_conv_cache["conv2"] = self.conv2(hidden_states, conv_cache=conv_cache.get("conv2"))
394
-
395
- if self.in_channels != self.out_channels:
396
- if self.use_conv_shortcut:
397
- inputs, new_conv_cache["conv_shortcut"] = self.conv_shortcut(
398
- inputs, conv_cache=conv_cache.get("conv_shortcut")
399
- )
400
- else:
401
- inputs = self.conv_shortcut(inputs)
402
-
403
- hidden_states = hidden_states + inputs
404
- return hidden_states, new_conv_cache
405
-
406
-
407
- class CogVideoXDownBlock3D(nn.Module):
408
- r"""
409
- A downsampling block used in the CogVideoX model.
410
-
411
- Args:
412
- in_channels (`int`):
413
- Number of input channels.
414
- out_channels (`int`, *optional*):
415
- Number of output channels. If None, defaults to `in_channels`.
416
- temb_channels (`int`, defaults to `512`):
417
- Number of time embedding channels.
418
- num_layers (`int`, defaults to `1`):
419
- Number of resnet layers.
420
- dropout (`float`, defaults to `0.0`):
421
- Dropout rate.
422
- resnet_eps (`float`, defaults to `1e-6`):
423
- Epsilon value for normalization layers.
424
- resnet_act_fn (`str`, defaults to `"swish"`):
425
- Activation function to use.
426
- resnet_groups (`int`, defaults to `32`):
427
- Number of groups to separate the channels into for group normalization.
428
- add_downsample (`bool`, defaults to `True`):
429
- Whether or not to use a downsampling layer. If not used, the output dimension will be the same as the input dimension.
430
- compress_time (`bool`, defaults to `False`):
431
- Whether or not to downsample across the temporal dimension.
432
- pad_mode (str, defaults to `"first"`):
433
- Padding mode.
434
- """
435
-
436
- _supports_gradient_checkpointing = True
437
-
438
- def __init__(
439
- self,
440
- in_channels: int,
441
- out_channels: int,
442
- temb_channels: int,
443
- dropout: float = 0.0,
444
- num_layers: int = 1,
445
- resnet_eps: float = 1e-6,
446
- resnet_act_fn: str = "swish",
447
- resnet_groups: int = 32,
448
- add_downsample: bool = True,
449
- downsample_padding: int = 0,
450
- compress_time: bool = False,
451
- pad_mode: str = "first",
452
- ):
453
- super().__init__()
454
-
455
- resnets = []
456
- for i in range(num_layers):
457
- in_channel = in_channels if i == 0 else out_channels
458
- resnets.append(
459
- CogVideoXResnetBlock3D(
460
- in_channels=in_channel,
461
- out_channels=out_channels,
462
- dropout=dropout,
463
- temb_channels=temb_channels,
464
- groups=resnet_groups,
465
- eps=resnet_eps,
466
- non_linearity=resnet_act_fn,
467
- pad_mode=pad_mode,
468
- )
469
- )
470
-
471
- self.resnets = nn.ModuleList(resnets)
472
- self.downsamplers = None
473
-
474
- if add_downsample:
475
- self.downsamplers = nn.ModuleList(
476
- [
477
- CogVideoXDownsample3D(
478
- out_channels, out_channels, padding=downsample_padding, compress_time=compress_time
479
- )
480
- ]
481
- )
482
-
483
- self.gradient_checkpointing = False
484
-
485
- def forward(
486
- self,
487
- hidden_states: torch.Tensor,
488
- temb: Optional[torch.Tensor] = None,
489
- zq: Optional[torch.Tensor] = None,
490
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
491
- ) -> torch.Tensor:
492
- r"""Forward method of the `CogVideoXDownBlock3D` class."""
493
-
494
- new_conv_cache = {}
495
- conv_cache = conv_cache or {}
496
-
497
- for i, resnet in enumerate(self.resnets):
498
- conv_cache_key = f"resnet_{i}"
499
-
500
- if torch.is_grad_enabled() and self.gradient_checkpointing:
501
-
502
- def create_custom_forward(module):
503
- def create_forward(*inputs):
504
- return module(*inputs)
505
-
506
- return create_forward
507
-
508
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
509
- create_custom_forward(resnet),
510
- hidden_states,
511
- temb,
512
- zq,
513
- conv_cache.get(conv_cache_key),
514
- )
515
- else:
516
- hidden_states, new_conv_cache[conv_cache_key] = resnet(
517
- hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
518
- )
519
-
520
- if self.downsamplers is not None:
521
- for downsampler in self.downsamplers:
522
- hidden_states = downsampler(hidden_states)
523
-
524
- return hidden_states, new_conv_cache
525
-
526
-
527
- class CogVideoXMidBlock3D(nn.Module):
528
- r"""
529
- A middle block used in the CogVideoX model.
530
-
531
- Args:
532
- in_channels (`int`):
533
- Number of input channels.
534
- temb_channels (`int`, defaults to `512`):
535
- Number of time embedding channels.
536
- dropout (`float`, defaults to `0.0`):
537
- Dropout rate.
538
- num_layers (`int`, defaults to `1`):
539
- Number of resnet layers.
540
- resnet_eps (`float`, defaults to `1e-6`):
541
- Epsilon value for normalization layers.
542
- resnet_act_fn (`str`, defaults to `"swish"`):
543
- Activation function to use.
544
- resnet_groups (`int`, defaults to `32`):
545
- Number of groups to separate the channels into for group normalization.
546
- spatial_norm_dim (`int`, *optional*):
547
- The dimension to use for spatial norm if it is to be used instead of group norm.
548
- pad_mode (str, defaults to `"first"`):
549
- Padding mode.
550
- """
551
-
552
- _supports_gradient_checkpointing = True
553
-
554
- def __init__(
555
- self,
556
- in_channels: int,
557
- temb_channels: int,
558
- dropout: float = 0.0,
559
- num_layers: int = 1,
560
- resnet_eps: float = 1e-6,
561
- resnet_act_fn: str = "swish",
562
- resnet_groups: int = 32,
563
- spatial_norm_dim: Optional[int] = None,
564
- pad_mode: str = "first",
565
- ):
566
- super().__init__()
567
-
568
- resnets = []
569
- for _ in range(num_layers):
570
- resnets.append(
571
- CogVideoXResnetBlock3D(
572
- in_channels=in_channels,
573
- out_channels=in_channels,
574
- dropout=dropout,
575
- temb_channels=temb_channels,
576
- groups=resnet_groups,
577
- eps=resnet_eps,
578
- spatial_norm_dim=spatial_norm_dim,
579
- non_linearity=resnet_act_fn,
580
- pad_mode=pad_mode,
581
- )
582
- )
583
- self.resnets = nn.ModuleList(resnets)
584
-
585
- self.gradient_checkpointing = False
586
-
587
- def forward(
588
- self,
589
- hidden_states: torch.Tensor,
590
- temb: Optional[torch.Tensor] = None,
591
- zq: Optional[torch.Tensor] = None,
592
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
593
- ) -> torch.Tensor:
594
- r"""Forward method of the `CogVideoXMidBlock3D` class."""
595
-
596
- new_conv_cache = {}
597
- conv_cache = conv_cache or {}
598
-
599
- for i, resnet in enumerate(self.resnets):
600
- conv_cache_key = f"resnet_{i}"
601
-
602
- if torch.is_grad_enabled() and self.gradient_checkpointing:
603
-
604
- def create_custom_forward(module):
605
- def create_forward(*inputs):
606
- return module(*inputs)
607
-
608
- return create_forward
609
-
610
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
611
- create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key)
612
- )
613
- else:
614
- hidden_states, new_conv_cache[conv_cache_key] = resnet(
615
- hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
616
- )
617
-
618
- return hidden_states, new_conv_cache
619
-
620
-
621
- class CogVideoXUpBlock3D(nn.Module):
622
- r"""
623
- An upsampling block used in the CogVideoX model.
624
-
625
- Args:
626
- in_channels (`int`):
627
- Number of input channels.
628
- out_channels (`int`, *optional*):
629
- Number of output channels. If None, defaults to `in_channels`.
630
- temb_channels (`int`, defaults to `512`):
631
- Number of time embedding channels.
632
- dropout (`float`, defaults to `0.0`):
633
- Dropout rate.
634
- num_layers (`int`, defaults to `1`):
635
- Number of resnet layers.
636
- resnet_eps (`float`, defaults to `1e-6`):
637
- Epsilon value for normalization layers.
638
- resnet_act_fn (`str`, defaults to `"swish"`):
639
- Activation function to use.
640
- resnet_groups (`int`, defaults to `32`):
641
- Number of groups to separate the channels into for group normalization.
642
- spatial_norm_dim (`int`, defaults to `16`):
643
- The dimension to use for spatial norm if it is to be used instead of group norm.
644
- add_upsample (`bool`, defaults to `True`):
645
- Whether or not to use an upsampling layer. If not used, the output dimension will be the same as the input dimension.
646
- compress_time (`bool`, defaults to `False`):
647
- Whether or not to upsample across the temporal dimension.
648
- pad_mode (str, defaults to `"first"`):
649
- Padding mode.
650
- """
651
-
652
- def __init__(
653
- self,
654
- in_channels: int,
655
- out_channels: int,
656
- temb_channels: int,
657
- dropout: float = 0.0,
658
- num_layers: int = 1,
659
- resnet_eps: float = 1e-6,
660
- resnet_act_fn: str = "swish",
661
- resnet_groups: int = 32,
662
- spatial_norm_dim: int = 16,
663
- add_upsample: bool = True,
664
- upsample_padding: int = 1,
665
- compress_time: bool = False,
666
- pad_mode: str = "first",
667
- ):
668
- super().__init__()
669
-
670
- resnets = []
671
- for i in range(num_layers):
672
- in_channel = in_channels if i == 0 else out_channels
673
- resnets.append(
674
- CogVideoXResnetBlock3D(
675
- in_channels=in_channel,
676
- out_channels=out_channels,
677
- dropout=dropout,
678
- temb_channels=temb_channels,
679
- groups=resnet_groups,
680
- eps=resnet_eps,
681
- non_linearity=resnet_act_fn,
682
- spatial_norm_dim=spatial_norm_dim,
683
- pad_mode=pad_mode,
684
- )
685
- )
686
-
687
- self.resnets = nn.ModuleList(resnets)
688
- self.upsamplers = None
689
-
690
- if add_upsample:
691
- self.upsamplers = nn.ModuleList(
692
- [
693
- CogVideoXUpsample3D(
694
- out_channels, out_channels, padding=upsample_padding, compress_time=compress_time
695
- )
696
- ]
697
- )
698
-
699
- self.gradient_checkpointing = False
700
-
701
- def forward(
702
- self,
703
- hidden_states: torch.Tensor,
704
- temb: Optional[torch.Tensor] = None,
705
- zq: Optional[torch.Tensor] = None,
706
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
707
- ) -> torch.Tensor:
708
- r"""Forward method of the `CogVideoXUpBlock3D` class."""
709
-
710
- new_conv_cache = {}
711
- conv_cache = conv_cache or {}
712
-
713
- for i, resnet in enumerate(self.resnets):
714
- conv_cache_key = f"resnet_{i}"
715
-
716
- if torch.is_grad_enabled() and self.gradient_checkpointing:
717
-
718
- def create_custom_forward(module):
719
- def create_forward(*inputs):
720
- return module(*inputs)
721
-
722
- return create_forward
723
-
724
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
725
- create_custom_forward(resnet),
726
- hidden_states,
727
- temb,
728
- zq,
729
- conv_cache.get(conv_cache_key),
730
- )
731
- else:
732
- hidden_states, new_conv_cache[conv_cache_key] = resnet(
733
- hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key)
734
- )
735
-
736
- if self.upsamplers is not None:
737
- for upsampler in self.upsamplers:
738
- hidden_states = upsampler(hidden_states)
739
-
740
- return hidden_states, new_conv_cache
741
-
742
-
743
- class CogVideoXEncoder3D(nn.Module):
744
- r"""
745
- The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation.
746
-
747
- Args:
748
- in_channels (`int`, *optional*, defaults to 3):
749
- The number of input channels.
750
- out_channels (`int`, *optional*, defaults to 3):
751
- The number of output channels.
752
- down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
753
- The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
754
- options.
755
- block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
756
- The number of output channels for each block.
757
- act_fn (`str`, *optional*, defaults to `"silu"`):
758
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
759
- layers_per_block (`int`, *optional*, defaults to 2):
760
- The number of layers per block.
761
- norm_num_groups (`int`, *optional*, defaults to 32):
762
- The number of groups for normalization.
763
- """
764
-
765
- _supports_gradient_checkpointing = True
766
-
767
- def __init__(
768
- self,
769
- in_channels: int = 3,
770
- out_channels: int = 16,
771
- down_block_types: Tuple[str, ...] = (
772
- "CogVideoXDownBlock3D",
773
- "CogVideoXDownBlock3D",
774
- "CogVideoXDownBlock3D",
775
- "CogVideoXDownBlock3D",
776
- ),
777
- block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
778
- layers_per_block: int = 3,
779
- act_fn: str = "silu",
780
- norm_eps: float = 1e-6,
781
- norm_num_groups: int = 32,
782
- dropout: float = 0.0,
783
- pad_mode: str = "first",
784
- temporal_compression_ratio: float = 4,
785
- ):
786
- super().__init__()
787
-
788
- # log2 of temporal_compress_times
789
- temporal_compress_level = int(np.log2(temporal_compression_ratio))
790
-
791
- self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
792
- self.down_blocks = nn.ModuleList([])
793
-
794
- # down blocks
795
- output_channel = block_out_channels[0]
796
- for i, down_block_type in enumerate(down_block_types):
797
- input_channel = output_channel
798
- output_channel = block_out_channels[i]
799
- is_final_block = i == len(block_out_channels) - 1
800
- compress_time = i < temporal_compress_level
801
-
802
- if down_block_type == "CogVideoXDownBlock3D":
803
- down_block = CogVideoXDownBlock3D(
804
- in_channels=input_channel,
805
- out_channels=output_channel,
806
- temb_channels=0,
807
- dropout=dropout,
808
- num_layers=layers_per_block,
809
- resnet_eps=norm_eps,
810
- resnet_act_fn=act_fn,
811
- resnet_groups=norm_num_groups,
812
- add_downsample=not is_final_block,
813
- compress_time=compress_time,
814
- )
815
- else:
816
- raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`")
817
-
818
- self.down_blocks.append(down_block)
819
-
820
- # mid block
821
- self.mid_block = CogVideoXMidBlock3D(
822
- in_channels=block_out_channels[-1],
823
- temb_channels=0,
824
- dropout=dropout,
825
- num_layers=2,
826
- resnet_eps=norm_eps,
827
- resnet_act_fn=act_fn,
828
- resnet_groups=norm_num_groups,
829
- pad_mode=pad_mode,
830
- )
831
-
832
- self.norm_out = nn.GroupNorm(norm_num_groups, block_out_channels[-1], eps=1e-6)
833
- self.conv_act = nn.SiLU()
834
- self.conv_out = CogVideoXCausalConv3d(
835
- block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode
836
- )
837
-
838
- self.gradient_checkpointing = False
839
-
840
- def forward(
841
- self,
842
- sample: torch.Tensor,
843
- temb: Optional[torch.Tensor] = None,
844
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
845
- ) -> torch.Tensor:
846
- r"""The forward method of the `CogVideoXEncoder3D` class."""
847
-
848
- new_conv_cache = {}
849
- conv_cache = conv_cache or {}
850
-
851
- hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
852
-
853
- if torch.is_grad_enabled() and self.gradient_checkpointing:
854
-
855
- def create_custom_forward(module):
856
- def custom_forward(*inputs):
857
- return module(*inputs)
858
-
859
- return custom_forward
860
-
861
- # 1. Down
862
- for i, down_block in enumerate(self.down_blocks):
863
- conv_cache_key = f"down_block_{i}"
864
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
865
- create_custom_forward(down_block),
866
- hidden_states,
867
- temb,
868
- None,
869
- conv_cache.get(conv_cache_key),
870
- )
871
-
872
- # 2. Mid
873
- hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
874
- create_custom_forward(self.mid_block),
875
- hidden_states,
876
- temb,
877
- None,
878
- conv_cache.get("mid_block"),
879
- )
880
- else:
881
- # 1. Down
882
- for i, down_block in enumerate(self.down_blocks):
883
- conv_cache_key = f"down_block_{i}"
884
- hidden_states, new_conv_cache[conv_cache_key] = down_block(
885
- hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key)
886
- )
887
-
888
- # 2. Mid
889
- hidden_states, new_conv_cache["mid_block"] = self.mid_block(
890
- hidden_states, temb, None, conv_cache=conv_cache.get("mid_block")
891
- )
892
-
893
- # 3. Post-process
894
- hidden_states = self.norm_out(hidden_states)
895
- hidden_states = self.conv_act(hidden_states)
896
-
897
- hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out"))
898
-
899
- return hidden_states, new_conv_cache
900
-
901
-
902
- class CogVideoXDecoder3D(nn.Module):
903
- r"""
904
- The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output
905
- sample.
906
-
907
- Args:
908
- in_channels (`int`, *optional*, defaults to 3):
909
- The number of input channels.
910
- out_channels (`int`, *optional*, defaults to 3):
911
- The number of output channels.
912
- up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
913
- The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
914
- block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
915
- The number of output channels for each block.
916
- act_fn (`str`, *optional*, defaults to `"silu"`):
917
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
918
- layers_per_block (`int`, *optional*, defaults to 2):
919
- The number of layers per block.
920
- norm_num_groups (`int`, *optional*, defaults to 32):
921
- The number of groups for normalization.
922
- """
923
-
924
- _supports_gradient_checkpointing = True
925
-
926
- def __init__(
927
- self,
928
- in_channels: int = 16,
929
- out_channels: int = 3,
930
- up_block_types: Tuple[str, ...] = (
931
- "CogVideoXUpBlock3D",
932
- "CogVideoXUpBlock3D",
933
- "CogVideoXUpBlock3D",
934
- "CogVideoXUpBlock3D",
935
- ),
936
- block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
937
- layers_per_block: int = 3,
938
- act_fn: str = "silu",
939
- norm_eps: float = 1e-6,
940
- norm_num_groups: int = 32,
941
- dropout: float = 0.0,
942
- pad_mode: str = "first",
943
- temporal_compression_ratio: float = 4,
944
- ):
945
- super().__init__()
946
-
947
- reversed_block_out_channels = list(reversed(block_out_channels))
948
-
949
- self.conv_in = CogVideoXCausalConv3d(
950
- in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode
951
- )
952
-
953
- # mid block
954
- self.mid_block = CogVideoXMidBlock3D(
955
- in_channels=reversed_block_out_channels[0],
956
- temb_channels=0,
957
- num_layers=2,
958
- resnet_eps=norm_eps,
959
- resnet_act_fn=act_fn,
960
- resnet_groups=norm_num_groups,
961
- spatial_norm_dim=in_channels,
962
- pad_mode=pad_mode,
963
- )
964
-
965
- # up blocks
966
- self.up_blocks = nn.ModuleList([])
967
-
968
- output_channel = reversed_block_out_channels[0]
969
- temporal_compress_level = int(np.log2(temporal_compression_ratio))
970
-
971
- for i, up_block_type in enumerate(up_block_types):
972
- prev_output_channel = output_channel
973
- output_channel = reversed_block_out_channels[i]
974
- is_final_block = i == len(block_out_channels) - 1
975
- compress_time = i < temporal_compress_level
976
-
977
- if up_block_type == "CogVideoXUpBlock3D":
978
- up_block = CogVideoXUpBlock3D(
979
- in_channels=prev_output_channel,
980
- out_channels=output_channel,
981
- temb_channels=0,
982
- dropout=dropout,
983
- num_layers=layers_per_block + 1,
984
- resnet_eps=norm_eps,
985
- resnet_act_fn=act_fn,
986
- resnet_groups=norm_num_groups,
987
- spatial_norm_dim=in_channels,
988
- add_upsample=not is_final_block,
989
- compress_time=compress_time,
990
- pad_mode=pad_mode,
991
- )
992
- prev_output_channel = output_channel
993
- else:
994
- raise ValueError("Invalid `up_block_type` encountered. Must be `CogVideoXUpBlock3D`")
995
-
996
- self.up_blocks.append(up_block)
997
-
998
- self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[-1], in_channels, groups=norm_num_groups)
999
- self.conv_act = nn.SiLU()
1000
- self.conv_out = CogVideoXCausalConv3d(
1001
- reversed_block_out_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode
1002
- )
1003
-
1004
- self.gradient_checkpointing = False
1005
-
1006
- def forward(
1007
- self,
1008
- sample: torch.Tensor,
1009
- temb: Optional[torch.Tensor] = None,
1010
- conv_cache: Optional[Dict[str, torch.Tensor]] = None,
1011
- ) -> torch.Tensor:
1012
- r"""The forward method of the `CogVideoXDecoder3D` class."""
1013
-
1014
- new_conv_cache = {}
1015
- conv_cache = conv_cache or {}
1016
-
1017
- hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
1018
-
1019
- if torch.is_grad_enabled() and self.gradient_checkpointing:
1020
-
1021
- def create_custom_forward(module):
1022
- def custom_forward(*inputs):
1023
- return module(*inputs)
1024
-
1025
- return custom_forward
1026
-
1027
- # 1. Mid
1028
- hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint(
1029
- create_custom_forward(self.mid_block),
1030
- hidden_states,
1031
- temb,
1032
- sample,
1033
- conv_cache.get("mid_block"),
1034
- )
1035
-
1036
- # 2. Up
1037
- for i, up_block in enumerate(self.up_blocks):
1038
- conv_cache_key = f"up_block_{i}"
1039
- hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint(
1040
- create_custom_forward(up_block),
1041
- hidden_states,
1042
- temb,
1043
- sample,
1044
- conv_cache.get(conv_cache_key),
1045
- )
1046
- else:
1047
- # 1. Mid
1048
- hidden_states, new_conv_cache["mid_block"] = self.mid_block(
1049
- hidden_states, temb, sample, conv_cache=conv_cache.get("mid_block")
1050
- )
1051
-
1052
- # 2. Up
1053
- for i, up_block in enumerate(self.up_blocks):
1054
- conv_cache_key = f"up_block_{i}"
1055
- hidden_states, new_conv_cache[conv_cache_key] = up_block(
1056
- hidden_states, temb, sample, conv_cache=conv_cache.get(conv_cache_key)
1057
- )
1058
-
1059
- # 3. Post-process
1060
- hidden_states, new_conv_cache["norm_out"] = self.norm_out(
1061
- hidden_states, sample, conv_cache=conv_cache.get("norm_out")
1062
- )
1063
- hidden_states = self.conv_act(hidden_states)
1064
- hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out"))
1065
-
1066
- return hidden_states, new_conv_cache
1067
-
1068
-
1069
- class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin):
1070
- r"""
1071
- A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in
1072
- [CogVideoX](https://github.com/THUDM/CogVideo).
1073
-
1074
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
1075
- for all models (such as downloading or saving).
1076
-
1077
- Parameters:
1078
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
1079
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
1080
- down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
1081
- Tuple of downsample block types.
1082
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
1083
- Tuple of upsample block types.
1084
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
1085
- Tuple of block output channels.
1086
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
1087
- sample_size (`int`, *optional*, defaults to `32`): Sample input size.
1088
- scaling_factor (`float`, *optional*, defaults to `1.15258426`):
1089
- The component-wise standard deviation of the trained latent space computed using the first batch of the
1090
- training set. This is used to scale the latent space to have unit variance when training the diffusion
1091
- model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
1092
- diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
1093
- / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
1094
- Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
1095
- force_upcast (`bool`, *optional*, default to `True`):
1096
- If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
1097
- can be fine-tuned / trained to a lower range without losing too much precision, in which case
1098
- `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
1099
- """
1100
-
1101
- _supports_gradient_checkpointing = True
1102
- _no_split_modules = ["CogVideoXResnetBlock3D"]
1103
-
1104
- @register_to_config
1105
- def __init__(
1106
- self,
1107
- in_channels: int = 3,
1108
- out_channels: int = 3,
1109
- down_block_types: Tuple[str] = (
1110
- "CogVideoXDownBlock3D",
1111
- "CogVideoXDownBlock3D",
1112
- "CogVideoXDownBlock3D",
1113
- "CogVideoXDownBlock3D",
1114
- ),
1115
- up_block_types: Tuple[str] = (
1116
- "CogVideoXUpBlock3D",
1117
- "CogVideoXUpBlock3D",
1118
- "CogVideoXUpBlock3D",
1119
- "CogVideoXUpBlock3D",
1120
- ),
1121
- block_out_channels: Tuple[int] = (128, 256, 256, 512),
1122
- latent_channels: int = 16,
1123
- layers_per_block: int = 3,
1124
- act_fn: str = "silu",
1125
- norm_eps: float = 1e-6,
1126
- norm_num_groups: int = 32,
1127
- temporal_compression_ratio: float = 4,
1128
- sample_height: int = 480,
1129
- sample_width: int = 720,
1130
- scaling_factor: float = 1.15258426,
1131
- shift_factor: Optional[float] = None,
1132
- latents_mean: Optional[Tuple[float]] = None,
1133
- latents_std: Optional[Tuple[float]] = None,
1134
- force_upcast: float = True,
1135
- use_quant_conv: bool = False,
1136
- use_post_quant_conv: bool = False,
1137
- invert_scale_latents: bool = False,
1138
- ):
1139
- super().__init__()
1140
-
1141
- self.encoder = CogVideoXEncoder3D(
1142
- in_channels=in_channels,
1143
- out_channels=latent_channels,
1144
- down_block_types=down_block_types,
1145
- block_out_channels=block_out_channels,
1146
- layers_per_block=layers_per_block,
1147
- act_fn=act_fn,
1148
- norm_eps=norm_eps,
1149
- norm_num_groups=norm_num_groups,
1150
- temporal_compression_ratio=temporal_compression_ratio,
1151
- )
1152
- self.decoder = CogVideoXDecoder3D(
1153
- in_channels=latent_channels,
1154
- out_channels=out_channels,
1155
- up_block_types=up_block_types,
1156
- block_out_channels=block_out_channels,
1157
- layers_per_block=layers_per_block,
1158
- act_fn=act_fn,
1159
- norm_eps=norm_eps,
1160
- norm_num_groups=norm_num_groups,
1161
- temporal_compression_ratio=temporal_compression_ratio,
1162
- )
1163
- self.quant_conv = CogVideoXSafeConv3d(2 * out_channels, 2 * out_channels, 1) if use_quant_conv else None
1164
- self.post_quant_conv = CogVideoXSafeConv3d(out_channels, out_channels, 1) if use_post_quant_conv else None
1165
-
1166
- self.use_slicing = False
1167
- self.use_tiling = False
1168
- self.auto_split_process = False
1169
-
1170
- # Can be increased to decode more latent frames at once, but comes at a reasonable memory cost and it is not
1171
- # recommended because the temporal parts of the VAE, here, are tricky to understand.
1172
- # If you decode X latent frames together, the number of output frames is:
1173
- # (X + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) => X + 6 frames
1174
- #
1175
- # Example with num_latent_frames_batch_size = 2:
1176
- # - 12 latent frames: (0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11) are processed together
1177
- # => (12 // 2 frame slices) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
1178
- # => 6 * 8 = 48 frames
1179
- # - 13 latent frames: (0, 1, 2) (special case), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12) are processed together
1180
- # => (1 frame slice) * ((3 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) +
1181
- # ((13 - 3) // 2) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale))
1182
- # => 1 * 9 + 5 * 8 = 49 frames
1183
- # It has been implemented this way so as to not have "magic values" in the code base that would be hard to explain. Note that
1184
- # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different
1185
- # number of temporal frames.
1186
- self.num_latent_frames_batch_size = 2
1187
- self.num_sample_frames_batch_size = 8
1188
-
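Restating the arithmetic from the comment above as a tiny helper; the +6 offset per decoded slice is taken directly from that comment, not re-derived here.

def decoded_frames(num_latent_frames: int, batch: int = 2) -> int:
    # The first slice absorbs any odd latent frame, remaining slices hold `batch` latents,
    # and each slice of X latents decodes to X + 6 output frames (per the comment above).
    first = batch + num_latent_frames % batch
    rest = (num_latent_frames - first) // batch
    return (first + 6) + rest * (batch + 6)

print(decoded_frames(12))   # 48 -> six slices of 2 latents, 8 frames each
print(decoded_frames(13))   # 49 -> one slice of 3 latents (9 frames) plus five slices of 2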
1189
- # We make the minimum height and width of sample for tiling half that of the generally supported
1190
- self.tile_sample_min_height = sample_height // 2
1191
- self.tile_sample_min_width = sample_width // 2
1192
- self.tile_latent_min_height = int(
1193
- self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1))
1194
- )
1195
- self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1)))
1196
-
1197
- # These are experimental overlap factors that were chosen based on experimentation and seem to work best for
1198
- # 720x480 (WxH) resolution. The above resolution is the strongly recommended generation resolution in CogVideoX
1199
- # and so the tiling implementation has only been tested on those specific resolutions.
1200
- self.tile_overlap_factor_height = 1 / 6
1201
- self.tile_overlap_factor_width = 1 / 5
1202
-
1203
- def _set_gradient_checkpointing(self, module, value=False):
1204
- if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)):
1205
- module.gradient_checkpointing = value
1206
-
1207
- def enable_tiling(
1208
- self,
1209
- tile_sample_min_height: Optional[int] = None,
1210
- tile_sample_min_width: Optional[int] = None,
1211
- tile_overlap_factor_height: Optional[float] = None,
1212
- tile_overlap_factor_width: Optional[float] = None,
1213
- ) -> None:
1214
- r"""
1215
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
1216
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
1217
- processing larger images.
1218
-
1219
- Args:
1220
- tile_sample_min_height (`int`, *optional*):
1221
- The minimum height required for a sample to be separated into tiles across the height dimension.
1222
- tile_sample_min_width (`int`, *optional*):
1223
- The minimum width required for a sample to be separated into tiles across the width dimension.
1224
- tile_overlap_factor_height (`int`, *optional*):
1225
- The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
1226
- no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher
1227
- value might cause more tiles to be processed leading to slow down of the decoding process.
1228
- tile_overlap_factor_width (`int`, *optional*):
1229
- The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there
1230
- are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher
1231
- value might cause more tiles to be processed leading to slow down of the decoding process.
1232
- """
1233
- self.use_tiling = True
1234
- self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
1235
- self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
1236
- self.tile_latent_min_height = int(
1237
- self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1))
1238
- )
1239
- self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1)))
1240
- self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height
1241
- self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width
1242
-
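A quick numeric check of the tile sizes these defaults imply (720x480 samples, four entries in block_out_channels); purely illustrative, no new behaviour.

block_out_channels = (128, 256, 256, 512)
sample_height, sample_width = 480, 720

tile_sample_min_height = sample_height // 2        # 240
tile_sample_min_width = sample_width // 2           # 360
down_factor = 2 ** (len(block_out_channels) - 1)     # 8x spatial compression
print(tile_sample_min_height // down_factor)         # 30 latent rows per tile
print(tile_sample_min_width // down_factor)          # 45 latent columns per tile
# Overlap: 1/6 of the tile height and 1/5 of the tile width, i.e. 40 and 72 pixels
# in sample space, tuned (per the comment above) for 720x480 generation.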
1243
- def disable_tiling(self) -> None:
1244
- r"""
1245
- Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
1246
- decoding in one step.
1247
- """
1248
- self.use_tiling = False
1249
-
1250
- def enable_slicing(self) -> None:
1251
- r"""
1252
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
1253
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
1254
- """
1255
- self.use_slicing = True
1256
-
1257
- def disable_slicing(self) -> None:
1258
- r"""
1259
- Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
1260
- decoding in one step.
1261
- """
1262
- self.use_slicing = False
1263
-
1264
- def _set_first_frame(self):
1265
- for name, module in self.named_modules():
1266
- if isinstance(module, CogVideoXUpsample3D):
1267
- module.auto_split_process = False
1268
- module.first_frame_flag = True
1269
-
1270
- def _set_rest_frame(self):
1271
- for name, module in self.named_modules():
1272
- if isinstance(module, CogVideoXUpsample3D):
1273
- module.auto_split_process = False
1274
- module.first_frame_flag = False
1275
-
1276
- def enable_auto_split_process(self) -> None:
1277
- self.auto_split_process = True
1278
- for name, module in self.named_modules():
1279
- if isinstance(module, CogVideoXUpsample3D):
1280
- module.auto_split_process = True
1281
-
1282
- def disable_auto_split_process(self) -> None:
1283
- self.auto_split_process = False
1284
-
1285
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
1286
- batch_size, num_channels, num_frames, height, width = x.shape
1287
-
1288
- if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
1289
- return self.tiled_encode(x)
1290
-
1291
- frame_batch_size = self.num_sample_frames_batch_size
1292
- # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
1293
- # As the extra single frame is handled inside the loop, it is not required to round up here.
1294
- num_batches = max(num_frames // frame_batch_size, 1)
1295
- conv_cache = None
1296
- enc = []
1297
-
1298
- for i in range(num_batches):
1299
- remaining_frames = num_frames % frame_batch_size
1300
- start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
1301
- end_frame = frame_batch_size * (i + 1) + remaining_frames
1302
- x_intermediate = x[:, :, start_frame:end_frame]
1303
- x_intermediate, conv_cache = self.encoder(x_intermediate, conv_cache=conv_cache)
1304
- if self.quant_conv is not None:
1305
- x_intermediate = self.quant_conv(x_intermediate)
1306
- enc.append(x_intermediate)
1307
-
1308
- enc = torch.cat(enc, dim=2)
1309
- return enc
1310
-
1311
- @apply_forward_hook
1312
- def encode(
1313
- self, x: torch.Tensor, return_dict: bool = True
1314
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
1315
- """
1316
- Encode a batch of images into latents.
1317
-
1318
- Args:
1319
- x (`torch.Tensor`): Input batch of images.
1320
- return_dict (`bool`, *optional*, defaults to `True`):
1321
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
1322
-
1323
- Returns:
1324
- The latent representations of the encoded videos. If `return_dict` is True, a
1325
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
1326
- """
1327
- if self.use_slicing and x.shape[0] > 1:
1328
- encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
1329
- h = torch.cat(encoded_slices)
1330
- else:
1331
- h = self._encode(x)
1332
-
1333
- posterior = DiagonalGaussianDistribution(h)
1334
-
1335
- if not return_dict:
1336
- return (posterior,)
1337
- return AutoencoderKLOutput(latent_dist=posterior)
1338
-
1339
- def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1340
- batch_size, num_channels, num_frames, height, width = z.shape
1341
-
1342
- if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height):
1343
- return self.tiled_decode(z, return_dict=return_dict)
1344
-
1345
- if self.auto_split_process:
1346
- frame_batch_size = self.num_latent_frames_batch_size
1347
- num_batches = max(num_frames // frame_batch_size, 1)
1348
- conv_cache = None
1349
- dec = []
1350
-
1351
- for i in range(num_batches):
1352
- remaining_frames = num_frames % frame_batch_size
1353
- start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames)
1354
- end_frame = frame_batch_size * (i + 1) + remaining_frames
1355
- z_intermediate = z[:, :, start_frame:end_frame]
1356
- if self.post_quant_conv is not None:
1357
- z_intermediate = self.post_quant_conv(z_intermediate)
1358
- z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache)
1359
- dec.append(z_intermediate)
1360
- else:
1361
- conv_cache = None
1362
- start_frame = 0
1363
- end_frame = 1
1364
- dec = []
1365
-
1366
- self._set_first_frame()
1367
- z_intermediate = z[:, :, start_frame:end_frame]
1368
- if self.post_quant_conv is not None:
1369
- z_intermediate = self.post_quant_conv(z_intermediate)
1370
- z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache)
1371
- dec.append(z_intermediate)
1372
-
1373
- self._set_rest_frame()
1374
- start_frame = end_frame
1375
- end_frame += self.num_latent_frames_batch_size
1376
-
1377
- while start_frame < num_frames:
1378
- z_intermediate = z[:, :, start_frame:end_frame]
1379
- if self.post_quant_conv is not None:
1380
- z_intermediate = self.post_quant_conv(z_intermediate)
1381
- z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache)
1382
- dec.append(z_intermediate)
1383
- start_frame = end_frame
1384
- end_frame += self.num_latent_frames_batch_size
1385
-
1386
- dec = torch.cat(dec, dim=2)
1387
-
1388
- if not return_dict:
1389
- return (dec,)
1390
-
1391
- return DecoderOutput(sample=dec)
1392
-
1393
- @apply_forward_hook
1394
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1395
- """
1396
- Decode a batch of images.
1397
-
1398
- Args:
1399
- z (`torch.Tensor`): Input batch of latent vectors.
1400
- return_dict (`bool`, *optional*, defaults to `True`):
1401
- Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1402
-
1403
- Returns:
1404
- [`~models.vae.DecoderOutput`] or `tuple`:
1405
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1406
- returned.
1407
- """
1408
- if self.use_slicing and z.shape[0] > 1:
1409
- decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
1410
- decoded = torch.cat(decoded_slices)
1411
- else:
1412
- decoded = self._decode(z).sample
1413
-
1414
- if not return_dict:
1415
- return (decoded,)
1416
- return DecoderOutput(sample=decoded)
1417
-
1418
- def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1419
- blend_extent = min(a.shape[3], b.shape[3], blend_extent)
1420
- for y in range(blend_extent):
1421
- b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
1422
- y / blend_extent
1423
- )
1424
- return b
1425
-
1426
- def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
1427
- blend_extent = min(a.shape[4], b.shape[4], blend_extent)
1428
- for x in range(blend_extent):
1429
- b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
1430
- x / blend_extent
1431
- )
1432
- return b
1433
-
1434
- def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
1435
- r"""Encode a batch of images using a tiled encoder.
1436
-
1437
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
1438
- steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
1439
- different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
1440
- tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
1441
- output, but they should be much less noticeable.
1442
-
1443
- Args:
1444
- x (`torch.Tensor`): Input batch of videos.
1445
-
1446
- Returns:
1447
- `torch.Tensor`:
1448
- The latent representation of the encoded videos.
1449
- """
1450
- # For a rough memory estimate, take a look at the `tiled_decode` method.
1451
- batch_size, num_channels, num_frames, height, width = x.shape
1452
-
1453
- overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height))
1454
- overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width))
1455
- blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height)
1456
- blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width)
1457
- row_limit_height = self.tile_latent_min_height - blend_extent_height
1458
- row_limit_width = self.tile_latent_min_width - blend_extent_width
1459
- frame_batch_size = self.num_sample_frames_batch_size
1460
-
1461
- # Split x into overlapping tiles and encode them separately.
1462
- # The tiles have an overlap to avoid seams between tiles.
1463
- rows = []
1464
- for i in range(0, height, overlap_height):
1465
- row = []
1466
- for j in range(0, width, overlap_width):
1467
- # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k.
1468
- # As the extra single frame is handled inside the loop, it is not required to round up here.
1469
- num_batches = max(num_frames // frame_batch_size, 1)
1470
- conv_cache = None
1471
- time = []
1472
-
1473
- for k in range(num_batches):
1474
- remaining_frames = num_frames % frame_batch_size
1475
- start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
1476
- end_frame = frame_batch_size * (k + 1) + remaining_frames
1477
- tile = x[
1478
- :,
1479
- :,
1480
- start_frame:end_frame,
1481
- i : i + self.tile_sample_min_height,
1482
- j : j + self.tile_sample_min_width,
1483
- ]
1484
- tile, conv_cache = self.encoder(tile, conv_cache=conv_cache)
1485
- if self.quant_conv is not None:
1486
- tile = self.quant_conv(tile)
1487
- time.append(tile)
1488
-
1489
- row.append(torch.cat(time, dim=2))
1490
- rows.append(row)
1491
-
1492
- result_rows = []
1493
- for i, row in enumerate(rows):
1494
- result_row = []
1495
- for j, tile in enumerate(row):
1496
- # blend the above tile and the left tile
1497
- # to the current tile and add the current tile to the result row
1498
- if i > 0:
1499
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
1500
- if j > 0:
1501
- tile = self.blend_h(row[j - 1], tile, blend_extent_width)
1502
- result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
1503
- result_rows.append(torch.cat(result_row, dim=4))
1504
-
1505
- enc = torch.cat(result_rows, dim=3)
1506
- return enc
1507
-
1508
- def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1509
- r"""
1510
- Decode a batch of images using a tiled decoder.
1511
-
1512
- Args:
1513
- z (`torch.Tensor`): Input batch of latent vectors.
1514
- return_dict (`bool`, *optional*, defaults to `True`):
1515
- Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1516
-
1517
- Returns:
1518
- [`~models.vae.DecoderOutput`] or `tuple`:
1519
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1520
- returned.
1521
- """
1522
- # Rough memory assessment:
1523
- # - In CogVideoX-2B, there are a total of 24 CausalConv3d layers.
1524
- # - The biggest intermediate dimensions are: [1, 128, 9, 480, 720].
1525
- # - Assume fp16 (2 bytes per value).
1526
- # Memory required: 1 * 128 * 9 * 480 * 720 * 24 * 2 / 1024**3 = 17.8 GB
1527
- #
1528
- # Memory assessment when using tiling:
1529
- # - Assume everything as above but now HxW is 240x360 by tiling in half
1530
- # Memory required: 1 * 128 * 9 * 240 * 360 * 24 * 2 / 1024**3 = 4.5 GB
1531
-
1532
- batch_size, num_channels, num_frames, height, width = z.shape
1533
-
1534
- overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height))
1535
- overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width))
1536
- blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height)
1537
- blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width)
1538
- row_limit_height = self.tile_sample_min_height - blend_extent_height
1539
- row_limit_width = self.tile_sample_min_width - blend_extent_width
1540
- frame_batch_size = self.num_latent_frames_batch_size
1541
-
1542
- # Split z into overlapping tiles and decode them separately.
1543
- # The tiles have an overlap to avoid seams between tiles.
1544
- rows = []
1545
- for i in range(0, height, overlap_height):
1546
- row = []
1547
- for j in range(0, width, overlap_width):
1548
- if self.auto_split_process:
1549
- num_batches = max(num_frames // frame_batch_size, 1)
1550
- conv_cache = None
1551
- time = []
1552
-
1553
- for k in range(num_batches):
1554
- remaining_frames = num_frames % frame_batch_size
1555
- start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames)
1556
- end_frame = frame_batch_size * (k + 1) + remaining_frames
1557
- tile = z[
1558
- :,
1559
- :,
1560
- start_frame:end_frame,
1561
- i : i + self.tile_latent_min_height,
1562
- j : j + self.tile_latent_min_width,
1563
- ]
1564
- if self.post_quant_conv is not None:
1565
- tile = self.post_quant_conv(tile)
1566
- tile, conv_cache = self.decoder(tile, conv_cache=conv_cache)
1567
- time.append(tile)
1568
-
1569
- row.append(torch.cat(time, dim=2))
1570
- else:
1571
- conv_cache = None
1572
- start_frame = 0
1573
- end_frame = 1
1574
- dec = []
1575
-
1576
- tile = z[
1577
- :,
1578
- :,
1579
- start_frame:end_frame,
1580
- i : i + self.tile_latent_min_height,
1581
- j : j + self.tile_latent_min_width,
1582
- ]
1583
-
1584
- self._set_first_frame()
1585
- if self.post_quant_conv is not None:
1586
- tile = self.post_quant_conv(tile)
1587
- tile, conv_cache = self.decoder(tile, conv_cache=conv_cache)
1588
- dec.append(tile)
1589
-
1590
- self._set_rest_frame()
1591
- start_frame = end_frame
1592
- end_frame += self.num_latent_frames_batch_size
1593
-
1594
- while start_frame < num_frames:
1595
- tile = z[
1596
- :,
1597
- :,
1598
- start_frame:end_frame,
1599
- i : i + self.tile_latent_min_height,
1600
- j : j + self.tile_latent_min_width,
1601
- ]
1602
- if self.post_quant_conv is not None:
1603
- tile = self.post_quant_conv(tile)
1604
- tile, conv_cache = self.decoder(tile, conv_cache=conv_cache)
1605
- dec.append(tile)
1606
- start_frame = end_frame
1607
- end_frame += self.num_latent_frames_batch_size
1608
-
1609
- row.append(torch.cat(dec, dim=2))
1610
- rows.append(row)
1611
-
1612
- result_rows = []
1613
- for i, row in enumerate(rows):
1614
- result_row = []
1615
- for j, tile in enumerate(row):
1616
- # blend the above tile and the left tile
1617
- # to the current tile and add the current tile to the result row
1618
- if i > 0:
1619
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height)
1620
- if j > 0:
1621
- tile = self.blend_h(row[j - 1], tile, blend_extent_width)
1622
- result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width])
1623
- result_rows.append(torch.cat(result_row, dim=4))
1624
-
1625
- dec = torch.cat(result_rows, dim=3)
1626
-
1627
- if not return_dict:
1628
- return (dec,)
1629
-
1630
- return DecoderOutput(sample=dec)
1631
-
1632
- def forward(
1633
- self,
1634
- sample: torch.Tensor,
1635
- sample_posterior: bool = False,
1636
- return_dict: bool = True,
1637
- generator: Optional[torch.Generator] = None,
1638
- ) -> Union[DecoderOutput, Tuple[DecoderOutput]]:
1639
- x = sample
1640
- posterior = self.encode(x).latent_dist
1641
- if sample_posterior:
1642
- z = posterior.sample(generator=generator)
1643
- else:
1644
- z = posterior.mode()
1645
- dec = self.decode(z)
1646
- if not return_dict:
1647
- return (dec,)
1648
- return dec
1649
-
1650
- @classmethod
1651
- def from_pretrained(cls, pretrained_model_path, subfolder=None, **vae_additional_kwargs):
1652
- if subfolder is not None:
1653
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
1654
-
1655
- config_file = os.path.join(pretrained_model_path, 'config.json')
1656
- if not os.path.isfile(config_file):
1657
- raise RuntimeError(f"{config_file} does not exist")
1658
- with open(config_file, "r") as f:
1659
- config = json.load(f)
1660
-
1661
- model = cls.from_config(config, **vae_additional_kwargs)
1662
- from diffusers.utils import WEIGHTS_NAME
1663
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
1664
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
1665
- if os.path.exists(model_file_safetensors):
1666
- from safetensors.torch import load_file, safe_open
1667
- state_dict = load_file(model_file_safetensors)
1668
- else:
1669
- if not os.path.isfile(model_file):
1670
- raise RuntimeError(f"{model_file} does not exist")
1671
- state_dict = torch.load(model_file, map_location="cpu")
1672
- m, u = model.load_state_dict(state_dict, strict=False)
1673
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1674
- print(m, u)
1675
- return model
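
For reference, the tiled decoding path in the deleted cogvideox_vae.py blends neighbouring tiles linearly across their overlap (blend_h along the width, blend_v along the height) before cropping each tile to its row/column limit. Below is a minimal, self-contained sketch of that horizontal blend, assuming 5D tensors shaped [B, C, T, H, W]; the helper name is illustrative and not part of the deleted module:

import torch

def blend_horizontal(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # Fade linearly from the right edge of tile `a` into the left edge of tile `b`
    # over `blend_extent` columns, mirroring the blend_h logic above.
    blend_extent = min(a.shape[4], b.shape[4], blend_extent)
    for x in range(blend_extent):
        w = x / blend_extent
        b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - w) + b[:, :, :, :, x] * w
    return b

left = torch.zeros(1, 3, 2, 8, 8)   # stand-in for an already-decoded left tile
right = torch.ones(1, 3, 2, 8, 8)   # stand-in for its right-hand neighbour
blended = blend_horizontal(left, right, blend_extent=4)
print(blended[0, 0, 0, 0, :4])      # tensor([0.0000, 0.2500, 0.5000, 0.7500])

The vertical blend works the same way, only over the height dimension (dim 3).
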
videox_fun/models/fantasytalking_audio_encoder.py DELETED
@@ -1,52 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/s2v/audio_encoder.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import math
4
-
5
- import librosa
6
- import numpy as np
7
- import torch
8
- import torch.nn.functional as F
9
- from diffusers.configuration_utils import ConfigMixin
10
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
11
- from diffusers.models.modeling_utils import ModelMixin
12
- from transformers import Wav2Vec2Model, Wav2Vec2Processor
13
-
14
-
15
- class FantasyTalkingAudioEncoder(ModelMixin, ConfigMixin, FromOriginalModelMixin):
16
- def __init__(self, pretrained_model_path="facebook/wav2vec2-base-960h", device='cpu'):
17
- super(FantasyTalkingAudioEncoder, self).__init__()
18
- # load pretrained model
19
- self.processor = Wav2Vec2Processor.from_pretrained(pretrained_model_path)
20
- self.model = Wav2Vec2Model.from_pretrained(pretrained_model_path)
21
- self.model = self.model.to(device)
22
-
23
- def extract_audio_feat(self, audio_path, num_frames = 81, fps = 16, sr = 16000):
24
- audio_input, sample_rate = librosa.load(audio_path, sr=sr)
25
-
26
- start_time = 0
27
- end_time = num_frames / fps
28
-
29
- start_sample = int(start_time * sr)
30
- end_sample = int(end_time * sr)
31
-
32
- try:
33
- audio_segment = audio_input[start_sample:end_sample]
34
- except Exception:
35
- audio_segment = audio_input
36
-
37
- input_values = self.processor(
38
- audio_segment, sampling_rate=sample_rate, return_tensors="pt"
39
- ).input_values.to(self.model.device, self.model.dtype)
40
-
41
- with torch.no_grad():
42
- fea = self.model(input_values).last_hidden_state
43
- return fea
44
-
45
- def extract_audio_feat_without_file_load(self, audio_segment, sample_rate):
46
- input_values = self.processor(
47
- audio_segment, sampling_rate=sample_rate, return_tensors="pt"
48
- ).input_values.to(self.model.device, self.model.dtype)
49
-
50
- with torch.no_grad():
51
- fea = self.model(input_values).last_hidden_state
52
- return fea
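
The deleted encoder above is a thin wrapper around Wav2Vec2 from transformers. A hedged, standalone sketch of the same feature-extraction flow (the checkpoint name and the 81-frame / 16-fps defaults come from the code above; "audio.wav" is only a placeholder path):

import librosa
import torch
from transformers import Wav2Vec2Model, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

num_frames, fps, sr = 81, 16, 16000
audio, _ = librosa.load("audio.wav", sr=sr)        # placeholder file, resampled to 16 kHz
segment = audio[: int(num_frames / fps * sr)]      # keep only the audio that covers the clip

inputs = processor(segment, sampling_rate=sr, return_tensors="pt").input_values
with torch.no_grad():
    features = model(inputs).last_hidden_state     # [1, L, 768] wav2vec2 hidden states
print(features.shape)
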
videox_fun/models/fantasytalking_transformer3d.py DELETED
@@ -1,644 +0,0 @@
1
- # Modified from https://github.com/Fantasy-AMAP/fantasy-talking/blob/main/diffsynth/models
2
- # Copyright Alibaba Inc. All Rights Reserved.
3
- import math
4
- import os
5
- from typing import Any, Dict
6
-
7
- import numpy as np
8
- import torch
9
- import torch.cuda.amp as amp
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- from diffusers.configuration_utils import register_to_config
13
- from diffusers.utils import is_torch_version
14
-
15
- from ..dist import sequence_parallel_all_gather, sequence_parallel_chunk
16
- from ..utils import cfg_skip
17
- from .attention_utils import attention
18
- from .wan_transformer3d import (WanAttentionBlock, WanLayerNorm, WanRMSNorm,
19
- WanSelfAttention, WanTransformer3DModel,
20
- sinusoidal_embedding_1d)
21
-
22
-
23
- class AudioProjModel(nn.Module):
24
- def __init__(self, audio_in_dim=1024, cross_attention_dim=1024):
25
- super().__init__()
26
- self.cross_attention_dim = cross_attention_dim
27
- self.proj = torch.nn.Linear(audio_in_dim, cross_attention_dim, bias=False)
28
- self.norm = torch.nn.LayerNorm(cross_attention_dim)
29
-
30
- def forward(self, audio_embeds):
31
- context_tokens = self.proj(audio_embeds)
32
- context_tokens = self.norm(context_tokens)
33
- return context_tokens # [B,L,C]
34
-
35
-
36
- class AudioCrossAttentionProcessor(nn.Module):
37
- def __init__(self, context_dim, hidden_dim):
38
- super().__init__()
39
-
40
- self.context_dim = context_dim
41
- self.hidden_dim = hidden_dim
42
-
43
- self.k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
44
- self.v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
45
-
46
- nn.init.zeros_(self.k_proj.weight)
47
- nn.init.zeros_(self.v_proj.weight)
48
-
49
- self.sp_world_size = 1
50
- self.sp_world_rank = 0
51
- self.all_gather = None
52
-
53
- def __call__(
54
- self,
55
- attn: nn.Module,
56
- x: torch.Tensor,
57
- context: torch.Tensor,
58
- context_lens: torch.Tensor,
59
- audio_proj: torch.Tensor,
60
- audio_context_lens: torch.Tensor,
61
- latents_num_frames: int = 21,
62
- audio_scale: float = 1.0,
63
- ) -> torch.Tensor:
64
- """
65
- x: [B, L1, C].
66
- context: [B, L2, C].
67
- context_lens: [B].
68
- audio_proj: [B, 21, L3, C]
69
- audio_context_lens: [B*21].
70
- """
71
- context_img = context[:, :257]
72
- context = context[:, 257:]
73
- b, n, d = x.size(0), attn.num_heads, attn.head_dim
74
-
75
- # Compute query, key, value
76
- q = attn.norm_q(attn.q(x)).view(b, -1, n, d)
77
- k = attn.norm_k(attn.k(context)).view(b, -1, n, d)
78
- v = attn.v(context).view(b, -1, n, d)
79
- k_img = attn.norm_k_img(attn.k_img(context_img)).view(b, -1, n, d)
80
- v_img = attn.v_img(context_img).view(b, -1, n, d)
81
- img_x = attention(q, k_img, v_img, k_lens=None)
82
- # Compute attention
83
- x = attention(q, k, v, k_lens=context_lens)
84
- x = x.flatten(2)
85
- img_x = img_x.flatten(2)
86
-
87
- if len(audio_proj.shape) == 4:
88
- if self.sp_world_size > 1:
89
- q = self.all_gather(q, dim=1)
90
-
91
- length = int(np.floor(q.size()[1] / latents_num_frames) * latents_num_frames)
92
- origin_length = q.size()[1]
93
- if origin_length > length:
94
- q_pad = q[:, length:]
95
- q = q[:, :length]
96
- audio_q = q.view(b * latents_num_frames, -1, n, d) # [b, 21, l1, n, d]
97
- ip_key = self.k_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
98
- ip_value = self.v_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
99
- audio_x = attention(
100
- audio_q, ip_key, ip_value, k_lens=audio_context_lens, attention_type="NORMAL"
101
- )
102
- audio_x = audio_x.view(b, q.size(1), n, d)
103
- if self.sp_world_size > 1:
104
- if origin_length > length:
105
- audio_x = torch.cat([audio_x, q_pad], dim=1)
106
- audio_x = torch.chunk(audio_x, self.sp_world_size, dim=1)[self.sp_world_rank]
107
- audio_x = audio_x.flatten(2)
108
- elif len(audio_proj.shape) == 3:
109
- ip_key = self.k_proj(audio_proj).view(b, -1, n, d)
110
- ip_value = self.v_proj(audio_proj).view(b, -1, n, d)
111
- audio_x = attention(q, ip_key, ip_value, k_lens=audio_context_lens, attention_type="NORMAL")
112
- audio_x = audio_x.flatten(2)
113
- # Output
114
- if isinstance(audio_scale, torch.Tensor):
115
- audio_scale = audio_scale[:, None, None]
116
-
117
- x = x + img_x + audio_x * audio_scale
118
- x = attn.o(x)
119
- # print(audio_scale)
120
- return x
121
-
122
-
123
- class AudioCrossAttention(WanSelfAttention):
124
- def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True, eps=1e-6):
125
- super().__init__(dim, num_heads, window_size, qk_norm, eps)
126
-
127
- self.k_img = nn.Linear(dim, dim)
128
- self.v_img = nn.Linear(dim, dim)
129
-
130
- self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
131
-
132
- self.processor = AudioCrossAttentionProcessor(2048, dim)
133
-
134
- def forward(
135
- self,
136
- x,
137
- context,
138
- context_lens,
139
- audio_proj,
140
- audio_context_lens,
141
- latents_num_frames,
142
- audio_scale: float = 1.0,
143
- **kwargs,
144
- ):
145
- """
146
- x: [B, L1, C].
147
- context: [B, L2, C].
148
- context_lens: [B].
149
- """
150
- if audio_proj is None:
151
- return self.processor(self, x, context, context_lens)
152
- else:
153
- return self.processor(
154
- self,
155
- x,
156
- context,
157
- context_lens,
158
- audio_proj,
159
- audio_context_lens,
160
- latents_num_frames,
161
- audio_scale,
162
- )
163
-
164
-
165
- class AudioAttentionBlock(nn.Module):
166
- def __init__(
167
- self,
168
- cross_attn_type, # Useless
169
- dim,
170
- ffn_dim,
171
- num_heads,
172
- window_size=(-1, -1),
173
- qk_norm=True,
174
- cross_attn_norm=False,
175
- eps=1e-6,
176
- ):
177
- super().__init__()
178
- self.dim = dim
179
- self.ffn_dim = ffn_dim
180
- self.num_heads = num_heads
181
- self.window_size = window_size
182
- self.qk_norm = qk_norm
183
- self.cross_attn_norm = cross_attn_norm
184
- self.eps = eps
185
-
186
- # Layers
187
- self.norm1 = WanLayerNorm(dim, eps)
188
- self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm, eps)
189
- self.norm3 = (
190
- WanLayerNorm(dim, eps, elementwise_affine=True)
191
- if cross_attn_norm
192
- else nn.Identity()
193
- )
194
- self.cross_attn = AudioCrossAttention(
195
- dim, num_heads, (-1, -1), qk_norm, eps
196
- )
197
- self.norm2 = WanLayerNorm(dim, eps)
198
- self.ffn = nn.Sequential(
199
- nn.Linear(dim, ffn_dim),
200
- nn.GELU(approximate="tanh"),
201
- nn.Linear(ffn_dim, dim),
202
- )
203
-
204
- # Modulation
205
- self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
206
-
207
- def forward(
208
- self,
209
- x,
210
- e,
211
- seq_lens,
212
- grid_sizes,
213
- freqs,
214
- context,
215
- context_lens,
216
- audio_proj=None,
217
- audio_context_lens=None,
218
- audio_scale=1,
219
- dtype=torch.bfloat16,
220
- t=0,
221
- ):
222
- assert e.dtype == torch.float32
223
- with amp.autocast(dtype=torch.float32):
224
- e = (self.modulation.to(dtype=e.dtype, device=e.device) + e).chunk(6, dim=1)
225
- assert e[0].dtype == torch.float32
226
-
227
- # self-attention
228
- y = self.self_attn(
229
- self.norm1(x).float() * (1 + e[1]) + e[0], seq_lens, grid_sizes, freqs, dtype, t=t
230
- )
231
- with amp.autocast(dtype=torch.float32):
232
- x = x + y * e[2]
233
-
234
- # Cross-attention & FFN function
235
- def cross_attn_ffn(x, context, context_lens, e):
236
- x = x + self.cross_attn(
237
- self.norm3(x), context, context_lens, dtype=dtype, t=t,
238
- audio_proj=audio_proj, audio_context_lens=audio_context_lens, audio_scale=audio_scale,
239
- latents_num_frames=grid_sizes[0][0],
240
- )
241
- y = self.ffn(self.norm2(x).float() * (1 + e[4]) + e[3])
242
- with amp.autocast(dtype=torch.float32):
243
- x = x + y * e[5]
244
- return x
245
-
246
- x = cross_attn_ffn(x, context, context_lens, e)
247
- return x
248
-
249
-
250
- class FantasyTalkingTransformer3DModel(WanTransformer3DModel):
251
- @register_to_config
252
- def __init__(self,
253
- model_type='i2v',
254
- patch_size=(1, 2, 2),
255
- text_len=512,
256
- in_dim=16,
257
- dim=2048,
258
- ffn_dim=8192,
259
- freq_dim=256,
260
- text_dim=4096,
261
- out_dim=16,
262
- num_heads=16,
263
- num_layers=32,
264
- window_size=(-1, -1),
265
- qk_norm=True,
266
- cross_attn_norm=True,
267
- eps=1e-6,
268
- cross_attn_type=None,
269
- audio_in_dim=768):
270
- super().__init__(model_type, patch_size, text_len, in_dim, dim, ffn_dim, freq_dim, text_dim, out_dim,
271
- num_heads, num_layers, window_size, qk_norm, cross_attn_norm, eps)
272
-
273
- if cross_attn_type is None:
274
- cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
275
- self.blocks = nn.ModuleList([
276
- AudioAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
277
- window_size, qk_norm, cross_attn_norm, eps)
278
- for _ in range(num_layers)
279
- ])
280
- for layer_idx, block in enumerate(self.blocks):
281
- block.self_attn.layer_idx = layer_idx
282
- block.self_attn.num_layers = self.num_layers
283
-
284
- self.proj_model = AudioProjModel(audio_in_dim, 2048)
285
-
286
- def split_audio_sequence(self, audio_proj_length, num_frames=81):
287
- """
288
- Map the audio feature sequence to corresponding latent frame slices.
289
-
290
- Args:
291
- audio_proj_length (int): The total length of the audio feature sequence
292
- (e.g., 173 in audio_proj[1, 173, 768]).
293
- num_frames (int): The number of video frames in the training data (default: 81).
294
-
295
- Returns:
296
- list: A list of [start_idx, end_idx] pairs. Each pair represents the index range
297
- (within the audio feature sequence) corresponding to a latent frame.
298
- """
299
- # Average number of tokens per original video frame
300
- tokens_per_frame = audio_proj_length / num_frames
301
-
302
- # Each latent frame covers 4 video frames, and we want the center
303
- tokens_per_latent_frame = tokens_per_frame * 4
304
- half_tokens = int(tokens_per_latent_frame / 2)
305
-
306
- pos_indices = []
307
- for i in range(int((num_frames - 1) / 4) + 1):
308
- if i == 0:
309
- pos_indices.append(0)
310
- else:
311
- start_token = tokens_per_frame * ((i - 1) * 4 + 1)
312
- end_token = tokens_per_frame * (i * 4 + 1)
313
- center_token = int((start_token + end_token) / 2) - 1
314
- pos_indices.append(center_token)
315
-
316
- # Build index ranges centered around each position
317
- pos_idx_ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
318
-
319
- # Adjust the first range to avoid negative start index
320
- pos_idx_ranges[0] = [
321
- -(half_tokens * 2 - pos_idx_ranges[1][0]),
322
- pos_idx_ranges[1][0],
323
- ]
324
-
325
- return pos_idx_ranges
326
-
327
- def split_tensor_with_padding(self, input_tensor, pos_idx_ranges, expand_length=0):
328
- """
329
- Split the input tensor into subsequences based on index ranges, and apply right-side zero-padding
330
- if the range exceeds the input boundaries.
331
-
332
- Args:
333
- input_tensor (Tensor): Input audio tensor of shape [1, L, 768].
334
- pos_idx_ranges (list): A list of index ranges, e.g. [[-7, 1], [1, 9], ..., [165, 173]].
335
- expand_length (int): Number of tokens to expand on both sides of each subsequence.
336
-
337
- Returns:
338
- sub_sequences (Tensor): A tensor of shape [1, F, L, 768], where L is the length after padding.
339
- Each element is a padded subsequence.
340
- k_lens (Tensor): A tensor of shape [F], representing the actual (unpadded) length of each subsequence.
341
- Useful for ignoring padding tokens in attention masks.
342
- """
343
- pos_idx_ranges = [
344
- [idx[0] - expand_length, idx[1] + expand_length] for idx in pos_idx_ranges
345
- ]
346
- sub_sequences = []
347
- seq_len = input_tensor.size(1) # 173
348
- max_valid_idx = seq_len - 1 # 172
349
- k_lens_list = []
350
- for start, end in pos_idx_ranges:
351
- # Calculate the fill amount
352
- pad_front = max(-start, 0)
353
- pad_back = max(end - max_valid_idx, 0)
354
-
355
- # Calculate the start and end indices of the valid part
356
- valid_start = max(start, 0)
357
- valid_end = min(end, max_valid_idx)
358
-
359
- # Extract the valid part
360
- if valid_start <= valid_end:
361
- valid_part = input_tensor[:, valid_start : valid_end + 1, :]
362
- else:
363
- valid_part = input_tensor.new_zeros((1, 0, input_tensor.size(2)))
364
-
365
- # Pad along the sequence dimension (the 1st dimension)
366
- padded_subseq = F.pad(
367
- valid_part,
368
- (0, 0, 0, pad_back + pad_front, 0, 0),
369
- mode="constant",
370
- value=0,
371
- )
372
- k_lens_list.append(padded_subseq.size(-2) - pad_back - pad_front)
373
-
374
- sub_sequences.append(padded_subseq)
375
- return torch.stack(sub_sequences, dim=1), torch.tensor(
376
- k_lens_list, dtype=torch.long
377
- )
378
-
379
- def enable_multi_gpus_inference(self,):
380
- super().enable_multi_gpus_inference()
381
- for name, module in self.named_modules():
382
- if module.__class__.__name__ == 'AudioCrossAttentionProcessor':
383
- module.sp_world_size = self.sp_world_size
384
- module.sp_world_rank = self.sp_world_rank
385
- module.all_gather = self.all_gather
386
-
387
- @cfg_skip()
388
- def forward(
389
- self,
390
- x,
391
- t,
392
- context,
393
- seq_len,
394
- audio_wav2vec_fea=None,
395
- clip_fea=None,
396
- y=None,
397
- audio_scale=1,
398
- cond_flag=True
399
- ):
400
- r"""
401
- Forward pass through the diffusion model
402
-
403
- Args:
404
- x (List[Tensor]):
405
- List of input video tensors, each with shape [C_in, F, H, W]
406
- t (Tensor):
407
- Diffusion timesteps tensor of shape [B]
408
- context (List[Tensor]):
409
- List of text embeddings each with shape [L, C]
410
- seq_len (`int`):
411
- Maximum sequence length for positional encoding
412
- clip_fea (Tensor, *optional*):
413
- CLIP image features for image-to-video mode
414
- y (List[Tensor], *optional*):
415
- Conditional video inputs for image-to-video mode, same shape as x
416
-
417
- Returns:
418
- List[Tensor]:
419
- List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
420
- """
421
- # Wan2.2 does not need a CLIP image encoder.
422
- # if self.model_type == 'i2v':
423
- # assert clip_fea is not None and y is not None
424
- # params
425
- device = self.patch_embedding.weight.device
426
- dtype = x.dtype
427
- if self.freqs.device != device and torch.device(type="meta") != device:
428
- self.freqs = self.freqs.to(device)
429
-
430
- if y is not None:
431
- x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
432
-
433
- # embeddings
434
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
435
-
436
- grid_sizes = torch.stack(
437
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
438
-
439
- x = [u.flatten(2).transpose(1, 2) for u in x]
440
-
441
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
442
- if self.sp_world_size > 1:
443
- seq_len = int(math.ceil(seq_len / self.sp_world_size)) * self.sp_world_size
444
- assert seq_lens.max() <= seq_len
445
- x = torch.cat([
446
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
447
- dim=1) for u in x
448
- ])
449
-
450
- # time embeddings
451
- with amp.autocast(dtype=torch.float32):
452
- if t.dim() != 1:
453
- if t.size(1) < seq_len:
454
- pad_size = seq_len - t.size(1)
455
- last_elements = t[:, -1].unsqueeze(1)
456
- padding = last_elements.repeat(1, pad_size)
457
- t = torch.cat([t, padding], dim=1)
458
- bt = t.size(0)
459
- ft = t.flatten()
460
- e = self.time_embedding(
461
- sinusoidal_embedding_1d(self.freq_dim,
462
- ft).unflatten(0, (bt, seq_len)).float())
463
- e0 = self.time_projection(e).unflatten(2, (6, self.dim))
464
- else:
465
- e = self.time_embedding(
466
- sinusoidal_embedding_1d(self.freq_dim, t).float())
467
- e0 = self.time_projection(e).unflatten(1, (6, self.dim))
468
-
469
- # assert e.dtype == torch.float32 and e0.dtype == torch.float32
470
- # e0 = e0.to(dtype)
471
- # e = e.to(dtype)
472
-
473
- # context
474
- context_lens = None
475
- context = self.text_embedding(
476
- torch.stack([
477
- torch.cat(
478
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
479
- for u in context
480
- ]))
481
-
482
- if clip_fea is not None:
483
- context_clip = self.img_emb(clip_fea) # bs x 257 x dim
484
- context = torch.concat([context_clip, context], dim=1)
485
-
486
- num_frames = (grid_sizes[0][0] - 1) * 4 + 1
487
- audio_proj_fea = self.proj_model(audio_wav2vec_fea)
488
- pos_idx_ranges = self.split_audio_sequence(audio_proj_fea.size(1), num_frames=num_frames)
489
- audio_proj, audio_context_lens = self.split_tensor_with_padding(
490
- audio_proj_fea, pos_idx_ranges, expand_length=4
491
- )
492
-
493
- # Context Parallel
494
- if self.sp_world_size > 1:
495
- x = torch.chunk(x, self.sp_world_size, dim=1)[self.sp_world_rank]
496
- if t.dim() != 1:
497
- e0 = torch.chunk(e0, self.sp_world_size, dim=1)[self.sp_world_rank]
498
- e = torch.chunk(e, self.sp_world_size, dim=1)[self.sp_world_rank]
499
-
500
- # TeaCache
501
- if self.teacache is not None:
502
- if cond_flag:
503
- if t.dim() != 1:
504
- modulated_inp = e0[:, -1, :]
505
- else:
506
- modulated_inp = e0
507
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
508
- if skip_flag:
509
- self.should_calc = True
510
- self.teacache.accumulated_rel_l1_distance = 0
511
- else:
512
- if cond_flag:
513
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
514
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
515
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
516
- self.should_calc = False
517
- else:
518
- self.should_calc = True
519
- self.teacache.accumulated_rel_l1_distance = 0
520
- self.teacache.previous_modulated_input = modulated_inp
521
- self.teacache.should_calc = self.should_calc
522
- else:
523
- self.should_calc = self.teacache.should_calc
524
-
525
- # TeaCache
526
- if self.teacache is not None:
527
- if not self.should_calc:
528
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
529
- x = x + previous_residual.to(x.device)[-x.size()[0]:,]
530
- else:
531
- ori_x = x.clone().cpu() if self.teacache.offload else x.clone()
532
-
533
- for block in self.blocks:
534
- if torch.is_grad_enabled() and self.gradient_checkpointing:
535
-
536
- def create_custom_forward(module):
537
- def custom_forward(*inputs):
538
- return module(*inputs)
539
-
540
- return custom_forward
541
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
542
- x = torch.utils.checkpoint.checkpoint(
543
- create_custom_forward(block),
544
- x,
545
- e0,
546
- seq_lens,
547
- grid_sizes,
548
- self.freqs,
549
- context,
550
- context_lens,
551
- audio_proj,
552
- audio_context_lens,
553
- audio_scale,
554
- dtype,
555
- t,
556
- **ckpt_kwargs,
557
- )
558
- else:
559
- # arguments
560
- kwargs = dict(
561
- e=e0,
562
- seq_lens=seq_lens,
563
- grid_sizes=grid_sizes,
564
- freqs=self.freqs,
565
- context=context,
566
- context_lens=context_lens,
567
- audio_proj=audio_proj,
568
- audio_context_lens=audio_context_lens,
569
- audio_scale=audio_scale,
570
- dtype=dtype,
571
- t=t
572
- )
573
- x = block(x, **kwargs)
574
-
575
- if cond_flag:
576
- self.teacache.previous_residual_cond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
577
- else:
578
- self.teacache.previous_residual_uncond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
579
- else:
580
- for block in self.blocks:
581
- if torch.is_grad_enabled() and self.gradient_checkpointing:
582
-
583
- def create_custom_forward(module):
584
- def custom_forward(*inputs):
585
- return module(*inputs)
586
-
587
- return custom_forward
588
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
589
- x = torch.utils.checkpoint.checkpoint(
590
- create_custom_forward(block),
591
- x,
592
- e0,
593
- seq_lens,
594
- grid_sizes,
595
- self.freqs,
596
- context,
597
- context_lens,
598
- audio_proj,
599
- audio_context_lens,
600
- audio_scale,
601
- dtype,
602
- t,
603
- **ckpt_kwargs,
604
- )
605
- else:
606
- # arguments
607
- kwargs = dict(
608
- e=e0,
609
- seq_lens=seq_lens,
610
- grid_sizes=grid_sizes,
611
- freqs=self.freqs,
612
- context=context,
613
- context_lens=context_lens,
614
- audio_proj=audio_proj,
615
- audio_context_lens=audio_context_lens,
616
- audio_scale=audio_scale,
617
- dtype=dtype,
618
- t=t
619
- )
620
- x = block(x, **kwargs)
621
-
622
- # head
623
- if torch.is_grad_enabled() and self.gradient_checkpointing:
624
- def create_custom_forward(module):
625
- def custom_forward(*inputs):
626
- return module(*inputs)
627
-
628
- return custom_forward
629
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
630
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.head), x, e, **ckpt_kwargs)
631
- else:
632
- x = self.head(x, e)
633
-
634
- if self.sp_world_size > 1:
635
- x = self.all_gather(x, dim=1)
636
-
637
- # Unpatchify
638
- x = self.unpatchify(x, grid_sizes)
639
- x = torch.stack(x)
640
- if self.teacache is not None and cond_flag:
641
- self.teacache.cnt += 1
642
- if self.teacache.cnt == self.teacache.num_steps:
643
- self.teacache.reset()
644
- return x
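
To make the audio-to-latent-frame mapping in split_audio_sequence easier to follow, here is a small stand-alone reproduction of the index computation, using the 173-token / 81-frame example from the docstring (illustrative only, not imported from the repository):

def split_audio_sequence(audio_proj_length: int, num_frames: int = 81):
    # Average number of audio tokens per original video frame.
    tokens_per_frame = audio_proj_length / num_frames
    # Each latent frame covers 4 video frames; take a window of that many tokens,
    # centred on the middle of the covered span.
    half_tokens = int(tokens_per_frame * 4 / 2)

    pos_indices = [0]
    for i in range(1, (num_frames - 1) // 4 + 1):
        start_token = tokens_per_frame * ((i - 1) * 4 + 1)
        end_token = tokens_per_frame * (i * 4 + 1)
        pos_indices.append(int((start_token + end_token) / 2) - 1)

    ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
    # Shift the first window so that it ends where the second one begins.
    ranges[0] = [ranges[1][0] - 2 * half_tokens, ranges[1][0]]
    return ranges

print(split_audio_sequence(173, 81)[:3])  # [[-7, 1], [1, 9], [9, 17]]

Each [start, end] pair is later padded by split_tensor_with_padding and attended to by the corresponding latent frame in AudioCrossAttentionProcessor.
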
videox_fun/models/flux2_image_processor.py DELETED
@@ -1,139 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/flux2/image_processor.py
2
- # Copyright 2025 The Black Forest Labs Team and The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import math
17
- from typing import Tuple
18
-
19
- import PIL.Image
20
-
21
- from diffusers.configuration_utils import register_to_config
22
- from diffusers.image_processor import VaeImageProcessor
23
-
24
-
25
- class Flux2ImageProcessor(VaeImageProcessor):
26
- r"""
27
- Image processor to preprocess the reference (character) image for the Flux2 model.
28
-
29
- Args:
30
- do_resize (`bool`, *optional*, defaults to `True`):
31
- Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
32
- `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
33
- vae_scale_factor (`int`, *optional*, defaults to `16`):
34
- VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
35
- this factor.
36
- vae_latent_channels (`int`, *optional*, defaults to `32`):
37
- VAE latent channels.
38
- do_normalize (`bool`, *optional*, defaults to `True`):
39
- Whether to normalize the image to [-1,1].
40
- do_convert_rgb (`bool`, *optional*, defaults to be `True`):
41
- Whether to convert the images to RGB format.
42
- """
43
-
44
- @register_to_config
45
- def __init__(
46
- self,
47
- do_resize: bool = True,
48
- vae_scale_factor: int = 16,
49
- vae_latent_channels: int = 32,
50
- do_normalize: bool = True,
51
- do_convert_rgb: bool = True,
52
- ):
53
- super().__init__(
54
- do_resize=do_resize,
55
- vae_scale_factor=vae_scale_factor,
56
- vae_latent_channels=vae_latent_channels,
57
- do_normalize=do_normalize,
58
- do_convert_rgb=do_convert_rgb,
59
- )
60
-
61
- @staticmethod
62
- def check_image_input(
63
- image: PIL.Image.Image, max_aspect_ratio: int = 8, min_side_length: int = 64, max_area: int = 1024 * 1024
64
- ) -> PIL.Image.Image:
65
- """
66
- Check if image meets minimum size and aspect ratio requirements.
67
-
68
- Args:
69
- image: PIL Image to validate
70
- max_aspect_ratio: Maximum allowed aspect ratio (width/height or height/width)
71
- min_side_length: Minimum pixels required for width and height
72
- max_area: Maximum allowed area in pixels²
73
-
74
- Returns:
75
- The input image if valid
76
-
77
- Raises:
78
- ValueError: If image is too small or aspect ratio is too extreme
79
- """
80
- if not isinstance(image, PIL.Image.Image):
81
- raise ValueError(f"Image must be a PIL.Image.Image, got {type(image)}")
82
-
83
- width, height = image.size
84
-
85
- # Check minimum dimensions
86
- if width < min_side_length or height < min_side_length:
87
- raise ValueError(
88
- f"Image too small: {width}×{height}. Both dimensions must be at least {min_side_length}px"
89
- )
90
-
91
- # Check aspect ratio
92
- aspect_ratio = max(width / height, height / width)
93
- if aspect_ratio > max_aspect_ratio:
94
- raise ValueError(
95
- f"Aspect ratio too extreme: {width}×{height} (ratio: {aspect_ratio:.1f}:1). "
96
- f"Maximum allowed ratio is {max_aspect_ratio}:1"
97
- )
98
-
99
- return image
100
-
101
- @staticmethod
102
- def _resize_to_target_area(image: PIL.Image.Image, target_area: int = 1024 * 1024) -> Tuple[int, int]:
103
- image_width, image_height = image.size
104
-
105
- scale = math.sqrt(target_area / (image_width * image_height))
106
- width = int(image_width * scale)
107
- height = int(image_height * scale)
108
-
109
- return image.resize((width, height), PIL.Image.Resampling.LANCZOS)
110
-
111
- def _resize_and_crop(
112
- self,
113
- image: PIL.Image.Image,
114
- width: int,
115
- height: int,
116
- ) -> PIL.Image.Image:
117
- r"""
118
- Center-crop the image to the specified width and height.
119
-
120
- Args:
121
- image (`PIL.Image.Image`):
122
- The image to resize and crop.
123
- width (`int`):
124
- The width to resize the image to.
125
- height (`int`):
126
- The height to resize the image to.
127
-
128
- Returns:
129
- `PIL.Image.Image`:
130
- The resized and cropped image.
131
- """
132
- image_width, image_height = image.size
133
-
134
- left = (image_width - width) // 2
135
- top = (image_height - height) // 2
136
- right = left + width
137
- bottom = top + height
138
-
139
- return image.crop((left, top, right, bottom))
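
The validation and resizing logic in the deleted Flux2ImageProcessor boils down to an aspect-ratio check plus an area-preserving rescale. A hedged sketch of those two steps on an arbitrary PIL image (the 1024*1024 target area and the 8:1 ratio limit mirror the defaults above; the blank Image.new input is only a stand-in):

import math

import PIL.Image

MAX_ASPECT_RATIO = 8
TARGET_AREA = 1024 * 1024

def prepare_reference_image(image: PIL.Image.Image) -> PIL.Image.Image:
    width, height = image.size
    # Reject extreme aspect ratios up front, as check_image_input does.
    if max(width / height, height / width) > MAX_ASPECT_RATIO:
        raise ValueError(f"Aspect ratio of {width}x{height} exceeds {MAX_ASPECT_RATIO}:1")
    # Rescale so the pixel area is roughly TARGET_AREA while keeping the aspect ratio.
    scale = math.sqrt(TARGET_AREA / (width * height))
    return image.resize((int(width * scale), int(height * scale)), PIL.Image.Resampling.LANCZOS)

img = PIL.Image.new("RGB", (1920, 1080))
print(prepare_reference_image(img).size)  # ~(1365, 768): area close to the 1024*1024 target
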
videox_fun/models/flux2_transformer2d.py DELETED
@@ -1,1278 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_flux2.py
2
- # Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import glob
17
- import inspect
18
- import json
19
- import os
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import torch
23
- import torch.nn as nn
24
- import torch.nn.functional as F
25
- from diffusers.configuration_utils import ConfigMixin, register_to_config
26
- from diffusers.loaders import FromOriginalModelMixin
27
- from diffusers.models.attention_processor import Attention, AttentionProcessor
28
- from diffusers.models.embeddings import (TimestepEmbedding, Timesteps,
29
- apply_rotary_emb,
30
- get_1d_rotary_pos_embed)
31
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
32
- from diffusers.models.modeling_utils import ModelMixin
33
- from diffusers.models.normalization import AdaLayerNormContinuous
34
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_npu_available,
35
- is_torch_version, logging, scale_lora_layers,
36
- unscale_lora_layers)
37
-
38
- from ..dist import (Flux2MultiGPUsAttnProcessor2_0, get_sequence_parallel_rank,
39
- get_sequence_parallel_world_size, get_sp_group)
40
- from .attention_utils import attention
41
-
42
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
-
44
-
45
- def _get_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
46
- query = attn.to_q(hidden_states)
47
- key = attn.to_k(hidden_states)
48
- value = attn.to_v(hidden_states)
49
-
50
- encoder_query = encoder_key = encoder_value = None
51
- if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
52
- encoder_query = attn.add_q_proj(encoder_hidden_states)
53
- encoder_key = attn.add_k_proj(encoder_hidden_states)
54
- encoder_value = attn.add_v_proj(encoder_hidden_states)
55
-
56
- return query, key, value, encoder_query, encoder_key, encoder_value
57
-
58
-
59
- def _get_qkv_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
60
- return _get_projections(attn, hidden_states, encoder_hidden_states)
61
-
62
-
63
- def apply_rotary_emb(
64
- x: torch.Tensor,
65
- freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
66
- use_real: bool = True,
67
- use_real_unbind_dim: int = -1,
68
- sequence_dim: int = 2,
69
- ) -> Tuple[torch.Tensor, torch.Tensor]:
70
- """
71
- Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
72
- to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
73
- reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
74
- tensors contain rotary embeddings and are returned as real tensors.
75
-
76
- Args:
77
- x (`torch.Tensor`):
78
- Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
79
- freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
80
-
81
- Returns:
82
- Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
83
- """
84
- if use_real:
85
- cos, sin = freqs_cis # [S, D]
86
- if sequence_dim == 2:
87
- cos = cos[None, None, :, :]
88
- sin = sin[None, None, :, :]
89
- elif sequence_dim == 1:
90
- cos = cos[None, :, None, :]
91
- sin = sin[None, :, None, :]
92
- else:
93
- raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
94
-
95
- cos, sin = cos.to(x.device), sin.to(x.device)
96
-
97
- if use_real_unbind_dim == -1:
98
- # Used for flux, cogvideox, hunyuan-dit
99
- x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, H, S, D//2]
100
- x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
101
- elif use_real_unbind_dim == -2:
102
- # Used for Stable Audio, OmniGen, CogView4 and Cosmos
103
- x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, H, S, D//2]
104
- x_rotated = torch.cat([-x_imag, x_real], dim=-1)
105
- else:
106
- raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
107
-
108
- out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
109
-
110
- return out
111
- else:
112
- # used for lumina
113
- x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
114
- freqs_cis = freqs_cis.unsqueeze(2)
115
- x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
116
-
117
- return x_out.type_as(x)
118
-
119
-
120
- class Flux2SwiGLU(nn.Module):
121
- """
122
- Flux 2 uses a SwiGLU-style activation in the transformer feedforward sub-blocks, but with the linear projection
123
- layer fused into the first linear layer of the FF sub-block. Thus, this module has no trainable parameters.
124
- """
125
-
126
- def __init__(self):
127
- super().__init__()
128
- self.gate_fn = nn.SiLU()
129
-
130
- def forward(self, x: torch.Tensor) -> torch.Tensor:
131
- x1, x2 = x.chunk(2, dim=-1)
132
- x = self.gate_fn(x1) * x2
133
- return x
134
-
135
-
136
- class Flux2FeedForward(nn.Module):
137
- def __init__(
138
- self,
139
- dim: int,
140
- dim_out: Optional[int] = None,
141
- mult: float = 3.0,
142
- inner_dim: Optional[int] = None,
143
- bias: bool = False,
144
- ):
145
- super().__init__()
146
- if inner_dim is None:
147
- inner_dim = int(dim * mult)
148
- dim_out = dim_out or dim
149
-
150
- # Flux2SwiGLU will reduce the dimension by half
151
- self.linear_in = nn.Linear(dim, inner_dim * 2, bias=bias)
152
- self.act_fn = Flux2SwiGLU()
153
- self.linear_out = nn.Linear(inner_dim, dim_out, bias=bias)
154
-
155
- def forward(self, x: torch.Tensor) -> torch.Tensor:
156
- x = self.linear_in(x)
157
- x = self.act_fn(x)
158
- x = self.linear_out(x)
159
- return x
160
-
161
-
162
- class Flux2AttnProcessor:
163
- _attention_backend = None
164
- _parallel_config = None
165
-
166
- def __init__(self):
167
- if not hasattr(F, "scaled_dot_product_attention"):
168
- raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
169
-
170
- def __call__(
171
- self,
172
- attn: Union["Flux2Attention", "Flux2ParallelSelfAttention"],
173
- hidden_states: torch.Tensor,
174
- encoder_hidden_states: Optional[torch.Tensor] = None,
175
- attention_mask: Optional[torch.Tensor] = None,
176
- image_rotary_emb: Optional[torch.Tensor] = None,
177
- text_seq_len: int = None,
178
- ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
179
- """
180
- Unified processor for both Flux2Attention and Flux2ParallelSelfAttention.
181
-
182
- Args:
183
- attn: Attention module (either Flux2Attention or Flux2ParallelSelfAttention)
184
- hidden_states: Input hidden states
185
- encoder_hidden_states: Optional encoder hidden states (only for Flux2Attention)
186
- attention_mask: Optional attention mask
187
- image_rotary_emb: Optional rotary embeddings
188
-
189
- Returns:
190
- For Flux2Attention with encoder_hidden_states: (hidden_states, encoder_hidden_states)
191
- For Flux2Attention without encoder_hidden_states: hidden_states
192
- For Flux2ParallelSelfAttention: hidden_states
193
- """
194
- # Determine which type of attention we're processing
195
- is_parallel_self_attn = hasattr(attn, 'to_qkv_mlp_proj') and attn.to_qkv_mlp_proj is not None
196
-
197
- if is_parallel_self_attn:
198
- # ============================================
199
- # Parallel Self-Attention Path (with MLP)
200
- # ============================================
201
- # Parallel in (QKV + MLP in) projection
202
- hidden_states = attn.to_qkv_mlp_proj(hidden_states)
203
- qkv, mlp_hidden_states = torch.split(
204
- hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
205
- )
206
-
207
- # Handle the attention logic
208
- query, key, value = qkv.chunk(3, dim=-1)
209
-
210
- else:
211
- # ============================================
212
- # Standard Attention Path (possibly with encoder)
213
- # ============================================
214
- query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
215
- attn, hidden_states, encoder_hidden_states
216
- )
217
-
218
- # Common processing for query, key, value
219
- query = query.unflatten(-1, (attn.heads, -1))
220
- key = key.unflatten(-1, (attn.heads, -1))
221
- value = value.unflatten(-1, (attn.heads, -1))
222
-
223
- query = attn.norm_q(query)
224
- key = attn.norm_k(key)
225
-
226
- # Handle encoder projections (only for standard attention)
227
- if not is_parallel_self_attn and attn.added_kv_proj_dim is not None:
228
- encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
229
- encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
230
- encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
231
-
232
- encoder_query = attn.norm_added_q(encoder_query)
233
- encoder_key = attn.norm_added_k(encoder_key)
234
-
235
- query = torch.cat([encoder_query, query], dim=1)
236
- key = torch.cat([encoder_key, key], dim=1)
237
- value = torch.cat([encoder_value, value], dim=1)
238
-
239
- # Apply rotary embeddings
240
- if image_rotary_emb is not None:
241
- query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
242
- key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
243
-
244
- # Perform attention
245
- hidden_states = attention(
246
- query, key, value, attn_mask=attention_mask,
247
- )
248
- hidden_states = hidden_states.flatten(2, 3)
249
- hidden_states = hidden_states.to(query.dtype)
250
-
251
- if is_parallel_self_attn:
252
- # ============================================
253
- # Parallel Self-Attention Output Path
254
- # ============================================
255
- # Handle the feedforward (FF) logic
256
- mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
257
-
258
- # Concatenate and parallel output projection
259
- hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
260
- hidden_states = attn.to_out(hidden_states)
261
-
262
- return hidden_states
263
-
264
- else:
265
- # ============================================
266
- # Standard Attention Output Path
267
- # ============================================
268
- # Split encoder and latent hidden states if encoder was used
269
- if encoder_hidden_states is not None:
270
- encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
271
- [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
272
- )
273
- encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
274
-
275
- # Project output
276
- hidden_states = attn.to_out[0](hidden_states)
277
- hidden_states = attn.to_out[1](hidden_states)
278
-
279
- if encoder_hidden_states is not None:
280
- return hidden_states, encoder_hidden_states
281
- else:
282
- return hidden_states
283
-
284
-
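The processor above always runs attention over a single concatenated sequence with the text (encoder) tokens first and the image tokens second, and splits the result back afterwards. A small sketch of that bookkeeping (shapes are illustrative):

import torch

text_len, img_len, dim = 12, 64, 8
joint = torch.randn(1, text_len + img_len, dim)   # post-attention [text | image] sequence
text_part, img_part = joint.split_with_sizes([text_len, img_len], dim=1)
assert text_part.shape[1] == text_len and img_part.shape[1] == img_len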
285
- class Flux2Attention(torch.nn.Module):
286
- _default_processor_cls = Flux2AttnProcessor
287
- _available_processors = [Flux2AttnProcessor]
288
-
289
- def __init__(
290
- self,
291
- query_dim: int,
292
- heads: int = 8,
293
- dim_head: int = 64,
294
- dropout: float = 0.0,
295
- bias: bool = False,
296
- added_kv_proj_dim: Optional[int] = None,
297
- added_proj_bias: Optional[bool] = True,
298
- out_bias: bool = True,
299
- eps: float = 1e-5,
300
- out_dim: int = None,
301
- elementwise_affine: bool = True,
302
- processor=None,
303
- ):
304
- super().__init__()
305
-
306
- self.head_dim = dim_head
307
- self.inner_dim = out_dim if out_dim is not None else dim_head * heads
308
- self.query_dim = query_dim
309
- self.out_dim = out_dim if out_dim is not None else query_dim
310
- self.heads = out_dim // dim_head if out_dim is not None else heads
311
-
312
- self.use_bias = bias
313
- self.dropout = dropout
314
-
315
- self.added_kv_proj_dim = added_kv_proj_dim
316
- self.added_proj_bias = added_proj_bias
317
-
318
- self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
319
- self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
320
- self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
321
-
322
- # QK Norm
323
- self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
324
- self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
325
-
326
- self.to_out = torch.nn.ModuleList([])
327
- self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
328
- self.to_out.append(torch.nn.Dropout(dropout))
329
-
330
- if added_kv_proj_dim is not None:
331
- self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
332
- self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
333
- self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
334
- self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
335
- self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
336
- self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
337
-
338
- if processor is None:
339
- processor = self._default_processor_cls()
340
- self.set_processor(processor)
341
-
342
- def set_processor(self, processor: AttentionProcessor) -> None:
343
- """
344
- Set the attention processor to use.
345
-
346
- Args:
347
- processor (`AttnProcessor`):
348
- The attention processor to use.
349
- """
350
- # if current processor is in `self._modules` and if passed `processor` is not, we need to
351
- # pop `processor` from `self._modules`
352
- if (
353
- hasattr(self, "processor")
354
- and isinstance(self.processor, torch.nn.Module)
355
- and not isinstance(processor, torch.nn.Module)
356
- ):
357
- logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
358
- self._modules.pop("processor")
359
-
360
- self.processor = processor
361
-
362
- def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
363
- """
364
- Get the attention processor in use.
365
-
366
- Args:
367
- return_deprecated_lora (`bool`, *optional*, defaults to `False`):
368
- Set to `True` to return the deprecated LoRA attention processor.
369
-
370
- Returns:
371
- "AttentionProcessor": The attention processor in use.
372
- """
373
- if not return_deprecated_lora:
374
- return self.processor
375
-
376
- def forward(
377
- self,
378
- hidden_states: torch.Tensor,
379
- encoder_hidden_states: Optional[torch.Tensor] = None,
380
- attention_mask: Optional[torch.Tensor] = None,
381
- image_rotary_emb: Optional[torch.Tensor] = None,
382
- **kwargs,
383
- ) -> torch.Tensor:
384
- attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
385
- unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
386
- if len(unused_kwargs) > 0:
387
- logger.warning(
388
- f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
389
- )
390
- kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
391
- return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
392
-
393
-
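One subtlety of Flux2Attention worth noting: when out_dim is passed, the head count actually used is derived from it and the heads argument is ignored. A quick arithmetic sketch with illustrative numbers:

dim_head, out_dim = 128, 6144
effective_heads = out_dim // dim_head   # 48; the `heads` argument only applies when out_dim is None
print(effective_heads)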
394
- class Flux2ParallelSelfAttention(torch.nn.Module):
395
- """
396
- Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.
397
-
398
- This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
399
- input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
400
- paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
401
- """
402
-
403
- _default_processor_cls = Flux2AttnProcessor
404
- _available_processors = [Flux2AttnProcessor]
405
- # Does not support QKV fusion as the QKV projections are always fused
406
- _supports_qkv_fusion = False
407
-
408
- def __init__(
409
- self,
410
- query_dim: int,
411
- heads: int = 8,
412
- dim_head: int = 64,
413
- dropout: float = 0.0,
414
- bias: bool = False,
415
- out_bias: bool = True,
416
- eps: float = 1e-5,
417
- out_dim: int = None,
418
- elementwise_affine: bool = True,
419
- mlp_ratio: float = 4.0,
420
- mlp_mult_factor: int = 2,
421
- processor=None,
422
- ):
423
- super().__init__()
424
-
425
- self.head_dim = dim_head
426
- self.inner_dim = out_dim if out_dim is not None else dim_head * heads
427
- self.query_dim = query_dim
428
- self.out_dim = out_dim if out_dim is not None else query_dim
429
- self.heads = out_dim // dim_head if out_dim is not None else heads
430
-
431
- self.use_bias = bias
432
- self.dropout = dropout
433
-
434
- self.mlp_ratio = mlp_ratio
435
- self.mlp_hidden_dim = int(query_dim * self.mlp_ratio)
436
- self.mlp_mult_factor = mlp_mult_factor
437
-
438
- # Fused QKV projections + MLP input projection
439
- self.to_qkv_mlp_proj = torch.nn.Linear(
440
- self.query_dim, self.inner_dim * 3 + self.mlp_hidden_dim * self.mlp_mult_factor, bias=bias
441
- )
442
- self.mlp_act_fn = Flux2SwiGLU()
443
-
444
- # QK Norm
445
- self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
446
- self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
447
-
448
- # Fused attention output projection + MLP output projection
449
- self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)
450
-
451
- if processor is None:
452
- processor = self._default_processor_cls()
453
- self.set_processor(processor)
454
-
455
- def set_processor(self, processor: AttentionProcessor) -> None:
456
- """
457
- Set the attention processor to use.
458
-
459
- Args:
460
- processor (`AttnProcessor`):
461
- The attention processor to use.
462
- """
463
- # if current processor is in `self._modules` and if passed `processor` is not, we need to
464
- # pop `processor` from `self._modules`
465
- if (
466
- hasattr(self, "processor")
467
- and isinstance(self.processor, torch.nn.Module)
468
- and not isinstance(processor, torch.nn.Module)
469
- ):
470
- logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
471
- self._modules.pop("processor")
472
-
473
- self.processor = processor
474
-
475
- def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
476
- """
477
- Get the attention processor in use.
478
-
479
- Args:
480
- return_deprecated_lora (`bool`, *optional*, defaults to `False`):
481
- Set to `True` to return the deprecated LoRA attention processor.
482
-
483
- Returns:
484
- "AttentionProcessor": The attention processor in use.
485
- """
486
- if not return_deprecated_lora:
487
- return self.processor
488
-
489
- def forward(
490
- self,
491
- hidden_states: torch.Tensor,
492
- encoder_hidden_states: Optional[torch.Tensor] = None,
493
- attention_mask: Optional[torch.Tensor] = None,
494
- image_rotary_emb: Optional[torch.Tensor] = None,
495
- **kwargs,
496
- ) -> torch.Tensor:
497
- attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
498
- unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
499
- if len(unused_kwargs) > 0:
500
- logger.warning(
501
- f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
502
- )
503
- kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
504
- return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
505
-
506
-
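The fused projections above determine the two large linear layers of the parallel block. With the defaults used later in this file (48 heads of size 128, so dim = inner_dim = 6144, and mlp_ratio = 3.0), the widths work out as in this arithmetic sketch (the numbers are purely illustrative of those defaults):

heads, dim_head = 48, 128
dim = inner_dim = heads * dim_head                             # 6144
mlp_hidden_dim = int(dim * 3.0)                                # 18432 with mlp_ratio = 3.0
mlp_mult_factor = 2                                            # SwiGLU consumes twice its hidden width
in_width = 3 * inner_dim + mlp_hidden_dim * mlp_mult_factor    # fused QKV + MLP input: 55296
out_width = inner_dim + mlp_hidden_dim                         # fused attention + MLP output: 24576
print(in_width, out_width)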
507
- class Flux2SingleTransformerBlock(nn.Module):
508
- def __init__(
509
- self,
510
- dim: int,
511
- num_attention_heads: int,
512
- attention_head_dim: int,
513
- mlp_ratio: float = 3.0,
514
- eps: float = 1e-6,
515
- bias: bool = False,
516
- ):
517
- super().__init__()
518
-
519
- self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
520
-
521
- # Note that the MLP in/out linear layers are fused with the attention QKV/out projections, respectively; this
522
- # is often called a "parallel" transformer block. See the [ViT-22B paper](https://arxiv.org/abs/2302.05442)
523
- # for a visual depiction of this type of transformer block.
524
- self.attn = Flux2ParallelSelfAttention(
525
- query_dim=dim,
526
- dim_head=attention_head_dim,
527
- heads=num_attention_heads,
528
- out_dim=dim,
529
- bias=bias,
530
- out_bias=bias,
531
- eps=eps,
532
- mlp_ratio=mlp_ratio,
533
- mlp_mult_factor=2,
534
- processor=Flux2AttnProcessor(),
535
- )
536
-
537
- def forward(
538
- self,
539
- hidden_states: torch.Tensor,
540
- encoder_hidden_states: Optional[torch.Tensor],
541
- temb_mod_params: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
542
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
543
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
544
- ) -> Tuple[torch.Tensor, torch.Tensor]:
545
- # If encoder_hidden_states is None, hidden_states is assumed to have encoder_hidden_states already
546
- # concatenated
547
- if encoder_hidden_states is not None:
548
- text_seq_len = encoder_hidden_states.shape[1]
549
- hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
550
-
551
- mod_shift, mod_scale, mod_gate = temb_mod_params
552
-
553
- norm_hidden_states = self.norm(hidden_states)
554
- norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
555
-
556
- joint_attention_kwargs = joint_attention_kwargs or {}
557
- attn_output = self.attn(
558
- hidden_states=norm_hidden_states,
559
- image_rotary_emb=image_rotary_emb,
560
- text_seq_len=text_seq_len,
561
- **joint_attention_kwargs,
562
- )
563
-
564
- hidden_states = hidden_states + mod_gate * attn_output
565
- if hidden_states.dtype == torch.float16:
566
- hidden_states = hidden_states.clip(-65504, 65504)
567
-
568
- encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
569
- return encoder_hidden_states, hidden_states
570
-
571
-
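The single-stream block above follows the usual adaptive-norm pattern: LayerNorm without affine parameters, a learned shift/scale applied to the normalized activations, and a gated residual update. A minimal numeric sketch of those three steps (shapes are illustrative):

import torch
import torch.nn as nn

dim = 16
x = torch.randn(1, 10, dim)
shift, scale, gate = torch.zeros(1, 1, dim), torch.zeros(1, 1, dim), torch.ones(1, 1, dim)
norm = nn.LayerNorm(dim, elementwise_affine=False)
normed = (1 + scale) * norm(x) + shift      # modulated pre-attention activations
attn_out = torch.randn(1, 10, dim)          # stand-in for the parallel attention output
x = x + gate * attn_out                     # gated residual update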
572
- class Flux2TransformerBlock(nn.Module):
573
- def __init__(
574
- self,
575
- dim: int,
576
- num_attention_heads: int,
577
- attention_head_dim: int,
578
- mlp_ratio: float = 3.0,
579
- eps: float = 1e-6,
580
- bias: bool = False,
581
- ):
582
- super().__init__()
583
- self.mlp_hidden_dim = int(dim * mlp_ratio)
584
-
585
- self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
586
- self.norm1_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
587
-
588
- self.attn = Flux2Attention(
589
- query_dim=dim,
590
- added_kv_proj_dim=dim,
591
- dim_head=attention_head_dim,
592
- heads=num_attention_heads,
593
- out_dim=dim,
594
- bias=bias,
595
- added_proj_bias=bias,
596
- out_bias=bias,
597
- eps=eps,
598
- processor=Flux2AttnProcessor(),
599
- )
600
-
601
- self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
602
- self.ff = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
603
-
604
- self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
605
- self.ff_context = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
606
-
607
- def forward(
608
- self,
609
- hidden_states: torch.Tensor,
610
- encoder_hidden_states: torch.Tensor,
611
- temb_mod_params_img: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
612
- temb_mod_params_txt: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
613
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
614
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
615
- ) -> Tuple[torch.Tensor, torch.Tensor]:
616
- joint_attention_kwargs = joint_attention_kwargs or {}
617
-
618
- # Modulation parameters shape: [1, 1, self.dim]
619
- (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
620
- (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt
621
-
622
- # Img stream
623
- norm_hidden_states = self.norm1(hidden_states)
624
- norm_hidden_states = (1 + scale_msa) * norm_hidden_states + shift_msa
625
-
626
- # Conditioning txt stream
627
- norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
628
- norm_encoder_hidden_states = (1 + c_scale_msa) * norm_encoder_hidden_states + c_shift_msa
629
-
630
- # Attention on concatenated img + txt stream
631
- attention_outputs = self.attn(
632
- hidden_states=norm_hidden_states,
633
- encoder_hidden_states=norm_encoder_hidden_states,
634
- image_rotary_emb=image_rotary_emb,
635
- **joint_attention_kwargs,
636
- )
637
-
638
- attn_output, context_attn_output = attention_outputs
639
-
640
- # Process attention outputs for the image stream (`hidden_states`).
641
- attn_output = gate_msa * attn_output
642
- hidden_states = hidden_states + attn_output
643
-
644
- norm_hidden_states = self.norm2(hidden_states)
645
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
646
-
647
- ff_output = self.ff(norm_hidden_states)
648
- hidden_states = hidden_states + gate_mlp * ff_output
649
-
650
- # Process attention outputs for the text stream (`encoder_hidden_states`).
651
- context_attn_output = c_gate_msa * context_attn_output
652
- encoder_hidden_states = encoder_hidden_states + context_attn_output
653
-
654
- norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
655
- norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
656
-
657
- context_ff_output = self.ff_context(norm_encoder_hidden_states)
658
- encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
659
- if encoder_hidden_states.dtype == torch.float16:
660
- encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
661
-
662
- return encoder_hidden_states, hidden_states
663
-
664
-
665
- class Flux2PosEmbed(nn.Module):
666
- # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
667
- def __init__(self, theta: int, axes_dim: List[int]):
668
- super().__init__()
669
- self.theta = theta
670
- self.axes_dim = axes_dim
671
-
672
- def forward(self, ids: torch.Tensor) -> torch.Tensor:
673
- # Expected ids shape: [S, len(self.axes_dim)]
674
- cos_out = []
675
- sin_out = []
676
- pos = ids.float()
677
- is_mps = ids.device.type == "mps"
678
- is_npu = ids.device.type == "npu"
679
- freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
680
- # Unlike Flux 1, loop over len(self.axes_dim) rather than ids.shape[-1]
681
- for i in range(len(self.axes_dim)):
682
- cos, sin = get_1d_rotary_pos_embed(
683
- self.axes_dim[i],
684
- pos[..., i],
685
- theta=self.theta,
686
- repeat_interleave_real=True,
687
- use_real=True,
688
- freqs_dtype=freqs_dtype,
689
- )
690
- cos_out.append(cos)
691
- sin_out.append(sin)
692
- freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
693
- freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
694
- return freqs_cos, freqs_sin
695
-
696
-
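Flux2PosEmbed expects one positional coordinate per RoPE axis for every token. With axes_dim = (32, 32, 32, 32) the per-axis cos/sin tables concatenate to 128 features, matching the attention head size. A small sketch of the expected ids layout (values are illustrative):

import torch

axes_dim = (32, 32, 32, 32)
num_tokens = 64
ids = torch.zeros(num_tokens, len(axes_dim))   # [S, number of RoPE axes]
print(ids.shape, sum(axes_dim))                # torch.Size([64, 4]) 128 -> matches head_dim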
697
- class Flux2TimestepGuidanceEmbeddings(nn.Module):
698
- def __init__(self, in_channels: int = 256, embedding_dim: int = 6144, bias: bool = False):
699
- super().__init__()
700
-
701
- self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
702
- self.timestep_embedder = TimestepEmbedding(
703
- in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
704
- )
705
-
706
- self.guidance_embedder = TimestepEmbedding(
707
- in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
708
- )
709
-
710
- def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> torch.Tensor:
711
- timesteps_proj = self.time_proj(timestep)
712
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype)) # (N, D)
713
-
714
- guidance_proj = self.time_proj(guidance)
715
- guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype)) # (N, D)
716
-
717
- time_guidance_emb = timesteps_emb + guidance_emb
718
-
719
- return time_guidance_emb
720
-
721
-
722
- class Flux2Modulation(nn.Module):
723
- def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
724
- super().__init__()
725
- self.mod_param_sets = mod_param_sets
726
-
727
- self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
728
- self.act_fn = nn.SiLU()
729
-
730
- def forward(self, temb: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
731
- mod = self.act_fn(temb)
732
- mod = self.linear(mod)
733
-
734
- if mod.ndim == 2:
735
- mod = mod.unsqueeze(1)
736
- mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
737
- # Return tuple of 3-tuples of modulation params shift/scale/gate
738
- return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))
739
-
740
-
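Flux2Modulation packs all shift/scale/gate triples for a block into one linear projection and then slices them apart. The slicing can be sketched without the module itself (shapes are illustrative):

import torch

dim, mod_param_sets = 8, 2
mod = torch.randn(1, 1, dim * 3 * mod_param_sets)        # output of the fused linear layer
chunks = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
sets = tuple(chunks[3 * i: 3 * (i + 1)] for i in range(mod_param_sets))
assert len(sets) == mod_param_sets and all(len(s) == 3 for s in sets)   # (shift, scale, gate) per set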
741
- class Flux2Transformer2DModel(
742
- ModelMixin,
743
- ConfigMixin,
744
- FromOriginalModelMixin,
745
- ):
746
- """
747
- The Transformer model introduced in Flux 2.
748
-
749
- Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
750
-
751
- Args:
752
- patch_size (`int`, defaults to `1`):
753
- Patch size to turn the input data into small patches.
754
- in_channels (`int`, defaults to `128`):
755
- The number of channels in the input.
756
- out_channels (`int`, *optional*, defaults to `None`):
757
- The number of channels in the output. If not specified, it defaults to `in_channels`.
758
- num_layers (`int`, defaults to `8`):
759
- The number of layers of dual stream DiT blocks to use.
760
- num_single_layers (`int`, defaults to `48`):
761
- The number of layers of single stream DiT blocks to use.
762
- attention_head_dim (`int`, defaults to `128`):
763
- The number of dimensions to use for each attention head.
764
- num_attention_heads (`int`, defaults to `48`):
765
- The number of attention heads to use.
766
- joint_attention_dim (`int`, defaults to `15360`):
767
- The number of dimensions to use for the joint attention (embedding/channel dimension of
768
- `encoder_hidden_states`).
769
- pooled_projection_dim (`int`, defaults to `768`):
770
- The number of dimensions to use for the pooled projection.
771
- guidance_embeds (`bool`, defaults to `True`):
772
- Whether to use guidance embeddings for guidance-distilled variant of the model.
773
- axes_dims_rope (`Tuple[int]`, defaults to `(32, 32, 32, 32)`):
774
- The dimensions to use for the rotary positional embeddings.
775
- """
776
-
777
- _supports_gradient_checkpointing = True
778
- # _no_split_modules = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
779
- # _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
780
- # _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
781
-
782
- @register_to_config
783
- def __init__(
784
- self,
785
- patch_size: int = 1,
786
- in_channels: int = 128,
787
- out_channels: Optional[int] = None,
788
- num_layers: int = 8,
789
- num_single_layers: int = 48,
790
- attention_head_dim: int = 128,
791
- num_attention_heads: int = 48,
792
- joint_attention_dim: int = 15360,
793
- timestep_guidance_channels: int = 256,
794
- mlp_ratio: float = 3.0,
795
- axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
796
- rope_theta: int = 2000,
797
- eps: float = 1e-6,
798
- ):
799
- super().__init__()
800
- self.out_channels = out_channels or in_channels
801
- self.inner_dim = num_attention_heads * attention_head_dim
802
-
803
- # 1. Sinusoidal positional embedding for RoPE on image and text tokens
804
- self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=axes_dims_rope)
805
-
806
- # 2. Combined timestep + guidance embedding
807
- self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
808
- in_channels=timestep_guidance_channels, embedding_dim=self.inner_dim, bias=False
809
- )
810
-
811
- # 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
812
- # Two sets of shift/scale/gate modulation parameters for the double stream attn and FF sub-blocks
813
- self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
814
- self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
815
- # Only one set of modulation parameters as the attn and FF sub-blocks are run in parallel for single stream
816
- self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False)
817
-
818
- # 4. Input projections
819
- self.x_embedder = nn.Linear(in_channels, self.inner_dim, bias=False)
820
- self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False)
821
-
822
- # 5. Double Stream Transformer Blocks
823
- self.transformer_blocks = nn.ModuleList(
824
- [
825
- Flux2TransformerBlock(
826
- dim=self.inner_dim,
827
- num_attention_heads=num_attention_heads,
828
- attention_head_dim=attention_head_dim,
829
- mlp_ratio=mlp_ratio,
830
- eps=eps,
831
- bias=False,
832
- )
833
- for _ in range(num_layers)
834
- ]
835
- )
836
-
837
- # 6. Single Stream Transformer Blocks
838
- self.single_transformer_blocks = nn.ModuleList(
839
- [
840
- Flux2SingleTransformerBlock(
841
- dim=self.inner_dim,
842
- num_attention_heads=num_attention_heads,
843
- attention_head_dim=attention_head_dim,
844
- mlp_ratio=mlp_ratio,
845
- eps=eps,
846
- bias=False,
847
- )
848
- for _ in range(num_single_layers)
849
- ]
850
- )
851
-
852
- # 7. Output layers
853
- self.norm_out = AdaLayerNormContinuous(
854
- self.inner_dim, self.inner_dim, elementwise_affine=False, eps=eps, bias=False
855
- )
856
- self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
857
-
858
- self.gradient_checkpointing = False
859
-
860
- self.sp_world_size = 1
861
- self.sp_world_rank = 0
862
-
863
- def _set_gradient_checkpointing(self, *args, **kwargs):
864
- if "value" in kwargs:
865
- self.gradient_checkpointing = kwargs["value"]
866
- elif "enable" in kwargs:
867
- self.gradient_checkpointing = kwargs["enable"]
868
- else:
869
- raise ValueError("Invalid set gradient checkpointing")
870
-
871
- def enable_multi_gpus_inference(self,):
872
- self.sp_world_size = get_sequence_parallel_world_size()
873
- self.sp_world_rank = get_sequence_parallel_rank()
874
- self.all_gather = get_sp_group().all_gather
875
- self.set_attn_processor(Flux2MultiGPUsAttnProcessor2_0())
876
-
877
- @property
878
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
879
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
880
- r"""
881
- Returns:
882
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
883
- indexed by its weight name.
884
- """
885
- # set recursively
886
- processors = {}
887
-
888
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
889
- if hasattr(module, "get_processor"):
890
- processors[f"{name}.processor"] = module.get_processor()
891
-
892
- for sub_name, child in module.named_children():
893
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
894
-
895
- return processors
896
-
897
- for name, module in self.named_children():
898
- fn_recursive_add_processors(name, module, processors)
899
-
900
- return processors
901
-
902
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
903
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
904
- r"""
905
- Sets the attention processor to use to compute attention.
906
-
907
- Parameters:
908
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
909
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
910
- for **all** `Attention` layers.
911
-
912
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
913
- processor. This is strongly recommended when setting trainable attention processors.
914
-
915
- """
916
- count = len(self.attn_processors.keys())
917
-
918
- if isinstance(processor, dict) and len(processor) != count:
919
- raise ValueError(
920
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
921
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
922
- )
923
-
924
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
925
- if hasattr(module, "set_processor"):
926
- if not isinstance(processor, dict):
927
- module.set_processor(processor)
928
- else:
929
- module.set_processor(processor.pop(f"{name}.processor"))
930
-
931
- for sub_name, child in module.named_children():
932
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
933
-
934
- for name, module in self.named_children():
935
- fn_recursive_attn_processor(name, module, processor)
936
-
937
- def forward(
938
- self,
939
- hidden_states: torch.Tensor,
940
- encoder_hidden_states: torch.Tensor = None,
941
- timestep: torch.LongTensor = None,
942
- img_ids: torch.Tensor = None,
943
- txt_ids: torch.Tensor = None,
944
- guidance: torch.Tensor = None,
945
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
946
- return_dict: bool = True,
947
- ) -> Union[torch.Tensor, Transformer2DModelOutput]:
948
- """
949
-         The [`Flux2Transformer2DModel`] forward method.
950
-
951
- Args:
952
- hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
953
- Input `hidden_states`.
954
- encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
955
- Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
956
- timestep ( `torch.LongTensor`):
957
- Used to indicate denoising step.
958
-             img_ids (`torch.Tensor` of shape `(image_sequence_length, len(axes_dims_rope))`):
-                 Per-token positional ids used to build the rotary embeddings for the image tokens.
-             txt_ids (`torch.Tensor` of shape `(text_sequence_length, len(axes_dims_rope))`):
-                 Per-token positional ids used to build the rotary embeddings for the text tokens.
-             guidance (`torch.Tensor`):
-                 Guidance scale input for the guidance-distilled model; embedded together with the timestep.
- joint_attention_kwargs (`dict`, *optional*):
961
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
962
- `self.processor` in
963
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
964
- return_dict (`bool`, *optional*, defaults to `True`):
965
- Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
966
- tuple.
967
-
968
- Returns:
969
- If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
970
- `tuple` where the first element is the sample tensor.
971
- """
972
- # 0. Handle input arguments
973
- if joint_attention_kwargs is not None:
974
- joint_attention_kwargs = joint_attention_kwargs.copy()
975
- lora_scale = joint_attention_kwargs.pop("scale", 1.0)
976
- else:
977
- lora_scale = 1.0
978
-
979
- num_txt_tokens = encoder_hidden_states.shape[1]
980
-
981
- # 1. Calculate timestep embedding and modulation parameters
982
- timestep = timestep.to(hidden_states.dtype) * 1000
983
- guidance = guidance.to(hidden_states.dtype) * 1000
984
-
985
- temb = self.time_guidance_embed(timestep, guidance)
986
-
987
- double_stream_mod_img = self.double_stream_modulation_img(temb)
988
- double_stream_mod_txt = self.double_stream_modulation_txt(temb)
989
- single_stream_mod = self.single_stream_modulation(temb)[0]
990
-
991
- # 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
992
- hidden_states = self.x_embedder(hidden_states)
993
- encoder_hidden_states = self.context_embedder(encoder_hidden_states)
994
-
995
- # 3. Calculate RoPE embeddings from image and text tokens
996
-         # NOTE: the logic below means that we can't support batched inference with images of different resolutions or
997
-         # text prompts of different lengths. Is this a use case we want to support?
998
- if img_ids.ndim == 3:
999
- img_ids = img_ids[0]
1000
- if txt_ids.ndim == 3:
1001
- txt_ids = txt_ids[0]
1002
-
1003
- if is_torch_npu_available():
1004
- freqs_cos_image, freqs_sin_image = self.pos_embed(img_ids.cpu())
1005
- image_rotary_emb = (freqs_cos_image.npu(), freqs_sin_image.npu())
1006
- freqs_cos_text, freqs_sin_text = self.pos_embed(txt_ids.cpu())
1007
- text_rotary_emb = (freqs_cos_text.npu(), freqs_sin_text.npu())
1008
- else:
1009
- image_rotary_emb = self.pos_embed(img_ids)
1010
- text_rotary_emb = self.pos_embed(txt_ids)
1011
- concat_rotary_emb = (
1012
- torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
1013
- torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
1014
- )
1015
-
1016
- # Context Parallel
1017
- if self.sp_world_size > 1:
1018
- hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
1019
- if concat_rotary_emb is not None:
1020
- txt_rotary_emb = (
1021
- concat_rotary_emb[0][:encoder_hidden_states.shape[1]],
1022
- concat_rotary_emb[1][:encoder_hidden_states.shape[1]]
1023
- )
1024
- concat_rotary_emb = (
1025
- torch.chunk(concat_rotary_emb[0][encoder_hidden_states.shape[1]:], self.sp_world_size, dim=0)[self.sp_world_rank],
1026
- torch.chunk(concat_rotary_emb[1][encoder_hidden_states.shape[1]:], self.sp_world_size, dim=0)[self.sp_world_rank],
1027
- )
1028
- concat_rotary_emb = [torch.cat([_txt_rotary_emb, _image_rotary_emb], dim=0) \
1029
- for _txt_rotary_emb, _image_rotary_emb in zip(txt_rotary_emb, concat_rotary_emb)]
1030
-
1031
- # 4. Double Stream Transformer Blocks
1032
- for index_block, block in enumerate(self.transformer_blocks):
1033
- if torch.is_grad_enabled() and self.gradient_checkpointing:
1034
- def create_custom_forward(module):
1035
- def custom_forward(*inputs):
1036
- return module(*inputs)
1037
-
1038
- return custom_forward
1039
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1040
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
1041
- create_custom_forward(block),
1042
- hidden_states,
1043
- encoder_hidden_states,
1044
- double_stream_mod_img,
1045
- double_stream_mod_txt,
1046
- concat_rotary_emb,
1047
- joint_attention_kwargs,
1048
- **ckpt_kwargs,
1049
- )
1050
- else:
1051
- encoder_hidden_states, hidden_states = block(
1052
- hidden_states=hidden_states,
1053
- encoder_hidden_states=encoder_hidden_states,
1054
- temb_mod_params_img=double_stream_mod_img,
1055
- temb_mod_params_txt=double_stream_mod_txt,
1056
- image_rotary_emb=concat_rotary_emb,
1057
- joint_attention_kwargs=joint_attention_kwargs,
1058
- )
1059
-
1060
- # 5. Single Stream Transformer Blocks
1061
- for index_block, block in enumerate(self.single_transformer_blocks):
1062
- if torch.is_grad_enabled() and self.gradient_checkpointing:
1063
- def create_custom_forward(module):
1064
- def custom_forward(*inputs):
1065
- return module(*inputs)
1066
-
1067
- return custom_forward
1068
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1069
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
1070
- create_custom_forward(block),
1071
- hidden_states,
1072
- encoder_hidden_states,
1073
- single_stream_mod,
1074
- concat_rotary_emb,
1075
- joint_attention_kwargs,
1076
- **ckpt_kwargs,
1077
- )
1078
- else:
1079
- encoder_hidden_states, hidden_states = block(
1080
- hidden_states=hidden_states,
1081
- encoder_hidden_states=encoder_hidden_states,
1082
- temb_mod_params=single_stream_mod,
1083
- image_rotary_emb=concat_rotary_emb,
1084
- joint_attention_kwargs=joint_attention_kwargs,
1085
- )
1086
-
1087
- # 6. Output layers
1088
- hidden_states = self.norm_out(hidden_states, temb)
1089
- output = self.proj_out(hidden_states)
1090
-
1091
- if self.sp_world_size > 1:
1092
- output = self.all_gather(output, dim=1)
1093
-
1094
- if not return_dict:
1095
- return (output,)
1096
-
1097
- return Transformer2DModelOutput(sample=output)
1098
-
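For the sequence-parallel path in the forward above, each rank keeps only its chunk of the image tokens (the text tokens and their rotary table stay replicated) and the final projection is all-gathered along the sequence dimension. A single-process sketch of the chunking arithmetic (world size and lengths are illustrative):

import torch

sp_world_size, sp_world_rank = 4, 1
image_tokens = torch.randn(1, 4096, 64)     # [B, image sequence, C] after x_embedder
local = torch.chunk(image_tokens, sp_world_size, dim=1)[sp_world_rank]
print(local.shape)                          # torch.Size([1, 1024, 64]) handled on this rank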
1099
- @classmethod
1100
- def from_pretrained(
1101
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
1102
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
1103
- ):
1104
- if subfolder is not None:
1105
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
1106
-         print(f"Loading pretrained 2D transformer weights from {pretrained_model_path} ...")
1107
-
1108
- config_file = os.path.join(pretrained_model_path, 'config.json')
1109
- if not os.path.isfile(config_file):
1110
- raise RuntimeError(f"{config_file} does not exist")
1111
- with open(config_file, "r") as f:
1112
- config = json.load(f)
1113
-
1114
- from diffusers.utils import WEIGHTS_NAME
1115
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
1116
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
1117
-
1118
- if "dict_mapping" in transformer_additional_kwargs.keys():
1119
- for key in transformer_additional_kwargs["dict_mapping"]:
1120
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
1121
-
1122
- if low_cpu_mem_usage:
1123
- try:
1124
- import re
1125
-
1126
- from diffusers import __version__ as diffusers_version
1127
- if diffusers_version >= "0.33.0":
1128
- from diffusers.models.model_loading_utils import \
1129
- load_model_dict_into_meta
1130
- else:
1131
- from diffusers.models.modeling_utils import \
1132
- load_model_dict_into_meta
1133
- from diffusers.utils import is_accelerate_available
1134
- if is_accelerate_available():
1135
- import accelerate
1136
-
1137
- # Instantiate model with empty weights
1138
- with accelerate.init_empty_weights():
1139
- model = cls.from_config(config, **transformer_additional_kwargs)
1140
-
1141
- param_device = "cpu"
1142
- if os.path.exists(model_file):
1143
- state_dict = torch.load(model_file, map_location="cpu")
1144
- elif os.path.exists(model_file_safetensors):
1145
- from safetensors.torch import load_file, safe_open
1146
- state_dict = load_file(model_file_safetensors)
1147
- else:
1148
- from safetensors.torch import load_file, safe_open
1149
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1150
- state_dict = {}
1151
- print(model_files_safetensors)
1152
- for _model_file_safetensors in model_files_safetensors:
1153
- _state_dict = load_file(_model_file_safetensors)
1154
- for key in _state_dict:
1155
- state_dict[key] = _state_dict[key]
1156
-
1157
- filtered_state_dict = {}
1158
- for key in state_dict:
1159
- if key in model.state_dict() and model.state_dict()[key].size() == state_dict[key].size():
1160
- filtered_state_dict[key] = state_dict[key]
1161
- else:
1162
- print(f"Skipping key '{key}' due to size mismatch or absence in model.")
1163
-
1164
- model_keys = set(model.state_dict().keys())
1165
- loaded_keys = set(filtered_state_dict.keys())
1166
- missing_keys = model_keys - loaded_keys
1167
-
1168
- def initialize_missing_parameters(missing_keys, model_state_dict, torch_dtype=None):
1169
- initialized_dict = {}
1170
-
1171
- with torch.no_grad():
1172
- for key in missing_keys:
1173
- param_shape = model_state_dict[key].shape
1174
- param_dtype = torch_dtype if torch_dtype is not None else model_state_dict[key].dtype
1175
- if 'weight' in key:
1176
- if any(norm_type in key for norm_type in ['norm', 'ln_', 'layer_norm', 'group_norm', 'batch_norm']):
1177
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1178
- elif 'embedding' in key or 'embed' in key:
1179
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1180
- elif 'head' in key or 'output' in key or 'proj_out' in key:
1181
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1182
- elif len(param_shape) >= 2:
1183
- initialized_dict[key] = torch.empty(param_shape, dtype=param_dtype)
1184
- nn.init.xavier_uniform_(initialized_dict[key])
1185
- else:
1186
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1187
- elif 'bias' in key:
1188
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1189
- elif 'running_mean' in key:
1190
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1191
- elif 'running_var' in key:
1192
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1193
- elif 'num_batches_tracked' in key:
1194
- initialized_dict[key] = torch.zeros(param_shape, dtype=torch.long)
1195
- else:
1196
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1197
-
1198
- return initialized_dict
1199
-
1200
- if missing_keys:
1201
- print(f"Missing keys will be initialized: {sorted(missing_keys)}")
1202
- initialized_params = initialize_missing_parameters(
1203
- missing_keys,
1204
- model.state_dict(),
1205
- torch_dtype
1206
- )
1207
- filtered_state_dict.update(initialized_params)
1208
-
1209
- if diffusers_version >= "0.33.0":
1210
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
1211
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
1212
- load_model_dict_into_meta(
1213
- model,
1214
- filtered_state_dict,
1215
- dtype=torch_dtype,
1216
- model_name_or_path=pretrained_model_path,
1217
- )
1218
- else:
1219
- model._convert_deprecated_attention_blocks(filtered_state_dict)
1220
- unexpected_keys = load_model_dict_into_meta(
1221
- model,
1222
- filtered_state_dict,
1223
- device=param_device,
1224
- dtype=torch_dtype,
1225
- model_name_or_path=pretrained_model_path,
1226
- )
1227
-
1228
- if cls._keys_to_ignore_on_load_unexpected is not None:
1229
- for pat in cls._keys_to_ignore_on_load_unexpected:
1230
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
1231
-
1232
- if len(unexpected_keys) > 0:
1233
- print(
1234
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
1235
- )
1236
-
1237
- return model
1238
- except Exception as e:
1239
- print(
1240
-                 f"The low_cpu_mem_usage mode does not work because {e}. Falling back to low_cpu_mem_usage=False."
1241
- )
1242
-
1243
- model = cls.from_config(config, **transformer_additional_kwargs)
1244
- if os.path.exists(model_file):
1245
- state_dict = torch.load(model_file, map_location="cpu")
1246
- elif os.path.exists(model_file_safetensors):
1247
- from safetensors.torch import load_file, safe_open
1248
- state_dict = load_file(model_file_safetensors)
1249
- else:
1250
- from safetensors.torch import load_file, safe_open
1251
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1252
- state_dict = {}
1253
- for _model_file_safetensors in model_files_safetensors:
1254
- _state_dict = load_file(_model_file_safetensors)
1255
- for key in _state_dict:
1256
- state_dict[key] = _state_dict[key]
1257
-
1258
- tmp_state_dict = {}
1259
- for key in state_dict:
1260
- if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
1261
- tmp_state_dict[key] = state_dict[key]
1262
- else:
1263
-                 print(key, "Size doesn't match, skipping")
1264
-
1265
- state_dict = tmp_state_dict
1266
-
1267
- m, u = model.load_state_dict(state_dict, strict=False)
1268
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1269
- print(m)
1270
-
1271
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
1272
- print(f"### All Parameters: {sum(params) / 1e6} M")
1273
-
1274
-         params = [p.numel() if "attn." in n else 0 for n, p in model.named_parameters()]
1275
-         print(f"### attn Parameters: {sum(params) / 1e6} M")
1276
-
1277
- model = model.to(torch_dtype)
1278
- return model
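A hedged end-to-end usage sketch of the class above. The import path and checkpoint directory are assumptions, and all shapes are illustrative; it simply mirrors the forward signature (timestep and guidance are multiplied by 1000 inside forward).

import torch
from videox_fun.models.flux2_transformer2d import Flux2Transformer2DModel  # assumed import path

model = Flux2Transformer2DModel.from_pretrained(
    "path/to/flux2_checkpoint", subfolder="transformer", torch_dtype=torch.bfloat16
)  # hypothetical checkpoint directory
bsz, img_len, txt_len = 1, 4096, 512
latents = torch.randn(bsz, img_len, 128, dtype=torch.bfloat16)        # in_channels = 128
text_embeds = torch.randn(bsz, txt_len, 15360, dtype=torch.bfloat16)  # joint_attention_dim
img_ids = torch.zeros(img_len, 4)                                     # one coordinate per RoPE axis
txt_ids = torch.zeros(txt_len, 4)
sample = model(
    hidden_states=latents,
    encoder_hidden_states=text_embeds,
    timestep=torch.tensor([0.5]),     # scaled by 1000 inside forward
    guidance=torch.tensor([4.0]),     # scaled by 1000 inside forward
    img_ids=img_ids,
    txt_ids=txt_ids,
).sample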
videox_fun/models/flux2_transformer2d_control.py DELETED
@@ -1,312 +0,0 @@
1
- # Modified from https://github.com/ali-vilab/VACE/blob/main/control/models/wan/wan_control.py
2
- # -*- coding: utf-8 -*-
3
- # Copyright (c) Alibaba, Inc. and its affiliates.
4
-
5
- import glob
6
- import inspect
7
- import json
8
- import os
9
- from typing import Any, Dict, List, Optional, Tuple, Union
10
-
11
- import torch
12
- import torch.nn as nn
13
- import torch.nn.functional as F
14
- from diffusers.configuration_utils import ConfigMixin, register_to_config
15
- from diffusers.loaders import FromOriginalModelMixin
16
- from diffusers.models.attention_processor import Attention, AttentionProcessor
17
- from diffusers.models.embeddings import (TimestepEmbedding, Timesteps,
18
- apply_rotary_emb,
19
- get_1d_rotary_pos_embed)
20
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
21
- from diffusers.models.modeling_utils import ModelMixin
22
- from diffusers.models.normalization import AdaLayerNormContinuous
23
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_npu_available,
24
- is_torch_version, logging, scale_lora_layers,
25
- unscale_lora_layers)
26
-
27
- from .flux2_transformer2d import (Flux2SingleTransformerBlock,
28
- Flux2Transformer2DModel,
29
- Flux2TransformerBlock)
30
-
31
-
32
- class Flux2ControlTransformerBlock(Flux2TransformerBlock):
33
- def __init__(
34
- self,
35
- dim: int,
36
- num_attention_heads: int,
37
- attention_head_dim: int,
38
- mlp_ratio: float = 3.0,
39
- eps: float = 1e-6,
40
- bias: bool = False,
41
- block_id=0
42
- ):
43
- super().__init__(dim, num_attention_heads, attention_head_dim, mlp_ratio, eps, bias)
44
- self.block_id = block_id
45
- if block_id == 0:
46
- self.before_proj = nn.Linear(dim, dim)
47
- nn.init.zeros_(self.before_proj.weight)
48
- nn.init.zeros_(self.before_proj.bias)
49
- self.after_proj = nn.Linear(dim, dim)
50
- nn.init.zeros_(self.after_proj.weight)
51
- nn.init.zeros_(self.after_proj.bias)
52
-
53
- def forward(self, c, x, **kwargs):
54
- if self.block_id == 0:
55
- c = self.before_proj(c) + x
56
- all_c = []
57
- else:
58
- all_c = list(torch.unbind(c))
59
- c = all_c.pop(-1)
60
-
61
- encoder_hidden_states, c = super().forward(c, **kwargs)
62
- c_skip = self.after_proj(c)
63
- all_c += [c_skip, c]
64
- c = torch.stack(all_c)
65
- return encoder_hidden_states, c
66
-
67
-
68
- class BaseFlux2TransformerBlock(Flux2TransformerBlock):
69
- def __init__(
70
- self,
71
- dim: int,
72
- num_attention_heads: int,
73
- attention_head_dim: int,
74
- mlp_ratio: float = 3.0,
75
- eps: float = 1e-6,
76
- bias: bool = False,
77
- block_id=0
78
- ):
79
- super().__init__(dim, num_attention_heads, attention_head_dim, mlp_ratio, eps, bias)
80
- self.block_id = block_id
81
-
82
- def forward(self, hidden_states, hints=None, context_scale=1.0, **kwargs):
83
- encoder_hidden_states, hidden_states = super().forward(hidden_states, **kwargs)
84
- if self.block_id is not None:
85
- hidden_states = hidden_states + hints[self.block_id] * context_scale
86
- return encoder_hidden_states, hidden_states
87
-
88
-
89
- class Flux2ControlTransformer2DModel(Flux2Transformer2DModel):
90
- @register_to_config
91
- def __init__(
92
- self,
93
- control_layers=None,
94
- control_in_dim=None,
95
- patch_size: int = 1,
96
- in_channels: int = 128,
97
- out_channels: Optional[int] = None,
98
- num_layers: int = 8,
99
- num_single_layers: int = 48,
100
- attention_head_dim: int = 128,
101
- num_attention_heads: int = 48,
102
- joint_attention_dim: int = 15360,
103
- timestep_guidance_channels: int = 256,
104
- mlp_ratio: float = 3.0,
105
- axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
106
- rope_theta: int = 2000,
107
- eps: float = 1e-6,
108
- ):
109
- super().__init__(
110
- patch_size, in_channels, out_channels, num_layers, num_single_layers, attention_head_dim,
111
- num_attention_heads, joint_attention_dim, timestep_guidance_channels, mlp_ratio, axes_dims_rope,
112
- rope_theta, eps
113
- )
114
-
115
- self.control_layers = [i for i in range(0, self.num_layers, 2)] if control_layers is None else control_layers
116
- self.control_in_dim = self.in_dim if control_in_dim is None else control_in_dim
117
-
118
- assert 0 in self.control_layers
119
- self.control_layers_mapping = {i: n for n, i in enumerate(self.control_layers)}
120
-
121
- # blocks
122
- del self.transformer_blocks
123
- self.transformer_blocks = nn.ModuleList(
124
- [
125
- BaseFlux2TransformerBlock(
126
- dim=self.inner_dim,
127
- num_attention_heads=num_attention_heads,
128
- attention_head_dim=attention_head_dim,
129
- mlp_ratio=mlp_ratio,
130
- eps=eps,
131
- block_id=self.control_layers_mapping[i] if i in self.control_layers else None
132
- )
133
- for i in range(num_layers)
134
- ]
135
- )
136
-
137
- # control blocks
138
- self.control_transformer_blocks = nn.ModuleList(
139
- [
140
- Flux2ControlTransformerBlock(
141
- dim=self.inner_dim,
142
- num_attention_heads=num_attention_heads,
143
- attention_head_dim=attention_head_dim,
144
- mlp_ratio=mlp_ratio,
145
- eps=eps,
146
- block_id=i
147
- )
148
- for i in self.control_layers
149
- ]
150
- )
151
-
152
- # control patch embeddings
153
- self.control_img_in = nn.Linear(self.control_in_dim, self.inner_dim)
154
-
155
- def forward_control(
156
- self,
157
- x,
158
- control_context,
159
- kwargs
160
- ):
161
- # embeddings
162
- c = self.control_img_in(control_context)
163
- # Context Parallel
164
- if self.sp_world_size > 1:
165
- c = torch.chunk(c, self.sp_world_size, dim=1)[self.sp_world_rank]
166
-
167
- # arguments
168
- new_kwargs = dict(x=x)
169
- new_kwargs.update(kwargs)
170
-
171
- for block in self.control_transformer_blocks:
172
- if torch.is_grad_enabled() and self.gradient_checkpointing:
173
- def create_custom_forward(module, **static_kwargs):
174
- def custom_forward(*inputs):
175
- return module(*inputs, **static_kwargs)
176
- return custom_forward
177
- ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
178
- encoder_hidden_states, c = torch.utils.checkpoint.checkpoint(
179
- create_custom_forward(block, **new_kwargs),
180
- c,
181
- **ckpt_kwargs,
182
- )
183
- else:
184
- encoder_hidden_states, c = block(c, **new_kwargs)
185
- new_kwargs["encoder_hidden_states"] = encoder_hidden_states
186
-
187
- hints = torch.unbind(c)[:-1]
188
- return hints
189
-
190
- def forward(
191
- self,
192
- hidden_states: torch.Tensor,
193
- encoder_hidden_states: torch.Tensor = None,
194
- timestep: torch.LongTensor = None,
195
- img_ids: torch.Tensor = None,
196
- txt_ids: torch.Tensor = None,
197
- guidance: torch.Tensor = None,
198
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
199
- control_context=None,
200
- control_context_scale=1.0,
201
- return_dict: bool = True,
202
- ):
203
- num_txt_tokens = encoder_hidden_states.shape[1]
204
-
205
- # 1. Calculate timestep embedding and modulation parameters
206
- timestep = timestep.to(hidden_states.dtype) * 1000
207
- guidance = guidance.to(hidden_states.dtype) * 1000
208
-
209
- temb = self.time_guidance_embed(timestep, guidance)
210
-
211
- double_stream_mod_img = self.double_stream_modulation_img(temb)
212
- double_stream_mod_txt = self.double_stream_modulation_txt(temb)
213
- single_stream_mod = self.single_stream_modulation(temb)[0]
214
-
215
- # 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
216
- hidden_states = self.x_embedder(hidden_states)
217
- encoder_hidden_states = self.context_embedder(encoder_hidden_states)
218
-
219
- # 3. Calculate RoPE embeddings from image and text tokens
220
- # NOTE: the below logic means that we can't support batched inference with images of different resolutions or
221
- # text prompts of differents lengths. Is this a use case we want to support?
222
- if img_ids.ndim == 3:
223
- img_ids = img_ids[0]
224
- if txt_ids.ndim == 3:
225
- txt_ids = txt_ids[0]
226
-
227
- if is_torch_npu_available():
228
- freqs_cos_image, freqs_sin_image = self.pos_embed(img_ids.cpu())
229
- image_rotary_emb = (freqs_cos_image.npu(), freqs_sin_image.npu())
230
- freqs_cos_text, freqs_sin_text = self.pos_embed(txt_ids.cpu())
231
- text_rotary_emb = (freqs_cos_text.npu(), freqs_sin_text.npu())
232
- else:
233
- image_rotary_emb = self.pos_embed(img_ids)
234
- text_rotary_emb = self.pos_embed(txt_ids)
235
- concat_rotary_emb = (
236
- torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
237
- torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
238
- )
239
-
240
- # Arguments
241
- kwargs = dict(
242
- encoder_hidden_states=encoder_hidden_states,
243
- temb_mod_params_img=double_stream_mod_img,
244
- temb_mod_params_txt=double_stream_mod_txt,
245
- image_rotary_emb=concat_rotary_emb,
246
- joint_attention_kwargs=joint_attention_kwargs,
247
- )
248
- hints = self.forward_control(
249
- hidden_states, control_context, kwargs
250
- )
251
-
252
- for index_block, block in enumerate(self.transformer_blocks):
253
- # Arguments
254
- kwargs = dict(
255
- encoder_hidden_states=encoder_hidden_states,
256
- temb_mod_params_img=double_stream_mod_img,
257
- temb_mod_params_txt=double_stream_mod_txt,
258
- image_rotary_emb=concat_rotary_emb,
259
- joint_attention_kwargs=joint_attention_kwargs,
260
- hints=hints,
261
- context_scale=control_context_scale
262
- )
263
- if torch.is_grad_enabled() and self.gradient_checkpointing:
264
- def create_custom_forward(module, **static_kwargs):
265
- def custom_forward(*inputs):
266
- return module(*inputs, **static_kwargs)
267
- return custom_forward
268
-
269
- ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
270
-
271
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
272
- create_custom_forward(block, **kwargs),
273
- hidden_states,
274
- **ckpt_kwargs,
275
- )
276
- else:
277
- encoder_hidden_states, hidden_states = block(hidden_states, **kwargs)
278
-
279
- for index_block, block in enumerate(self.single_transformer_blocks):
280
- if torch.is_grad_enabled() and self.gradient_checkpointing:
281
- def create_custom_forward(module):
282
- def custom_forward(*inputs):
283
- return module(*inputs)
284
-
285
- return custom_forward
286
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
287
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
288
- create_custom_forward(block),
289
- hidden_states,
290
- encoder_hidden_states,
291
- single_stream_mod,
292
- concat_rotary_emb,
293
- joint_attention_kwargs,
294
- **ckpt_kwargs,
295
- )
296
- else:
297
- encoder_hidden_states, hidden_states = block(
298
- hidden_states=hidden_states,
299
- encoder_hidden_states=encoder_hidden_states,
300
- temb_mod_params=single_stream_mod,
301
- image_rotary_emb=concat_rotary_emb,
302
- joint_attention_kwargs=joint_attention_kwargs,
303
- )
304
-
305
- # 6. Output layers
306
- hidden_states = self.norm_out(hidden_states, temb)
307
- output = self.proj_out(hidden_states)
308
-
309
- if not return_dict:
310
- return (output,)
311
-
312
- return Transformer2DModelOutput(sample=output)
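
The forward pass above builds one joint rotary embedding by concatenating the text-token frequencies in front of the image-token frequencies, matching the `[text, image]` token order used by the double- and single-stream blocks. A minimal, self-contained sketch of that layout (all sequence lengths and dimensions below are illustrative assumptions, not values read from a checkpoint config):

```python
import torch

# Hypothetical sizes, for illustration only.
num_txt_tokens, num_img_tokens, head_dim = 77, 1024, 128

# Each rotary embedding is a (cos, sin) pair of shape [seq_len, head_dim].
text_rotary_emb = (torch.randn(num_txt_tokens, head_dim), torch.randn(num_txt_tokens, head_dim))
image_rotary_emb = (torch.randn(num_img_tokens, head_dim), torch.randn(num_img_tokens, head_dim))

# Text frequencies first, then image frequencies, matching the joint token sequence.
concat_rotary_emb = (
    torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
    torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
)
assert concat_rotary_emb[0].shape == (num_txt_tokens + num_img_tokens, head_dim)
```
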
videox_fun/models/flux2_vae.py DELETED
@@ -1,543 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/autoencoder_kl_flux2.py
2
- # Copyright 2025 The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- import math
16
- from typing import Dict, Optional, Tuple, Union
17
-
18
- import torch
19
- import torch.nn as nn
20
- from diffusers.configuration_utils import ConfigMixin, register_to_config
21
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
22
- from diffusers.models.attention_processor import (
23
- ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, Attention,
24
- AttentionProcessor, AttnAddedKVProcessor, AttnProcessor,
25
- FusedAttnProcessor2_0)
26
- from diffusers.models.autoencoders.vae import (Decoder,
27
- DecoderOutput,
28
- DiagonalGaussianDistribution,
29
- Encoder)
30
- from diffusers.models.modeling_outputs import AutoencoderKLOutput
31
- from diffusers.models.modeling_utils import ModelMixin
32
- from diffusers.utils import deprecate
33
- from diffusers.utils.accelerate_utils import apply_forward_hook
34
-
35
-
36
- class AutoencoderKLFlux2(ModelMixin, ConfigMixin, FromOriginalModelMixin):
37
- r"""
38
- A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
39
-
40
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
41
- for all models (such as downloading or saving).
42
-
43
- Parameters:
44
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
45
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
46
- down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
47
- Tuple of downsample block types.
48
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
49
- Tuple of upsample block types.
50
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
51
- Tuple of block output channels.
52
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
53
- latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
54
- sample_size (`int`, *optional*, defaults to `32`): Sample input size.
55
- force_upcast (`bool`, *optional*, default to `True`):
56
- If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
57
- can be fine-tuned / trained to a lower range without losing too much precision in which case `force_upcast`
58
- can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
59
- mid_block_add_attention (`bool`, *optional*, default to `True`):
60
- If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the
61
- mid_block will only have resnet blocks
62
- """
63
-
64
- _supports_gradient_checkpointing = True
65
- _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"]
66
-
67
- @register_to_config
68
- def __init__(
69
- self,
70
- in_channels: int = 3,
71
- out_channels: int = 3,
72
- down_block_types: Tuple[str, ...] = (
73
- "DownEncoderBlock2D",
74
- "DownEncoderBlock2D",
75
- "DownEncoderBlock2D",
76
- "DownEncoderBlock2D",
77
- ),
78
- up_block_types: Tuple[str, ...] = (
79
- "UpDecoderBlock2D",
80
- "UpDecoderBlock2D",
81
- "UpDecoderBlock2D",
82
- "UpDecoderBlock2D",
83
- ),
84
- block_out_channels: Tuple[int, ...] = (
85
- 128,
86
- 256,
87
- 512,
88
- 512,
89
- ),
90
- layers_per_block: int = 2,
91
- act_fn: str = "silu",
92
- latent_channels: int = 32,
93
- norm_num_groups: int = 32,
94
- sample_size: int = 1024, # YiYi notes: not sure
95
- force_upcast: bool = True,
96
- use_quant_conv: bool = True,
97
- use_post_quant_conv: bool = True,
98
- mid_block_add_attention: bool = True,
99
- batch_norm_eps: float = 1e-4,
100
- batch_norm_momentum: float = 0.1,
101
- patch_size: Tuple[int, int] = (2, 2),
102
- ):
103
- super().__init__()
104
-
105
- # pass init params to Encoder
106
- self.encoder = Encoder(
107
- in_channels=in_channels,
108
- out_channels=latent_channels,
109
- down_block_types=down_block_types,
110
- block_out_channels=block_out_channels,
111
- layers_per_block=layers_per_block,
112
- act_fn=act_fn,
113
- norm_num_groups=norm_num_groups,
114
- double_z=True,
115
- mid_block_add_attention=mid_block_add_attention,
116
- )
117
-
118
- # pass init params to Decoder
119
- self.decoder = Decoder(
120
- in_channels=latent_channels,
121
- out_channels=out_channels,
122
- up_block_types=up_block_types,
123
- block_out_channels=block_out_channels,
124
- layers_per_block=layers_per_block,
125
- norm_num_groups=norm_num_groups,
126
- act_fn=act_fn,
127
- mid_block_add_attention=mid_block_add_attention,
128
- )
129
-
130
- self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None
131
- self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None
132
-
133
- self.bn = nn.BatchNorm2d(
134
- math.prod(patch_size) * latent_channels,
135
- eps=batch_norm_eps,
136
- momentum=batch_norm_momentum,
137
- affine=False,
138
- track_running_stats=True,
139
- )
140
-
141
- self.use_slicing = False
142
- self.use_tiling = False
143
-
144
- # only relevant if vae tiling is enabled
145
- self.tile_sample_min_size = self.config.sample_size
146
- sample_size = (
147
- self.config.sample_size[0]
148
- if isinstance(self.config.sample_size, (list, tuple))
149
- else self.config.sample_size
150
- )
151
- self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
152
- self.tile_overlap_factor = 0.25
153
-
154
- @property
155
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
156
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
157
- r"""
158
- Returns:
159
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
160
- indexed by its weight name.
161
- """
162
- # set recursively
163
- processors = {}
164
-
165
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
166
- if hasattr(module, "get_processor"):
167
- processors[f"{name}.processor"] = module.get_processor()
168
-
169
- for sub_name, child in module.named_children():
170
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
171
-
172
- return processors
173
-
174
- for name, module in self.named_children():
175
- fn_recursive_add_processors(name, module, processors)
176
-
177
- return processors
178
-
179
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
180
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
181
- r"""
182
- Sets the attention processor to use to compute attention.
183
-
184
- Parameters:
185
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
186
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
187
- for **all** `Attention` layers.
188
-
189
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
190
- processor. This is strongly recommended when setting trainable attention processors.
191
-
192
- """
193
- count = len(self.attn_processors.keys())
194
-
195
- if isinstance(processor, dict) and len(processor) != count:
196
- raise ValueError(
197
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
198
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
199
- )
200
-
201
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
202
- if hasattr(module, "set_processor"):
203
- if not isinstance(processor, dict):
204
- module.set_processor(processor)
205
- else:
206
- module.set_processor(processor.pop(f"{name}.processor"))
207
-
208
- for sub_name, child in module.named_children():
209
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
210
-
211
- for name, module in self.named_children():
212
- fn_recursive_attn_processor(name, module, processor)
213
-
214
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
215
- def set_default_attn_processor(self):
216
- """
217
- Disables custom attention processors and sets the default attention implementation.
218
- """
219
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
220
- processor = AttnAddedKVProcessor()
221
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
222
- processor = AttnProcessor()
223
- else:
224
- raise ValueError(
225
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
226
- )
227
-
228
- self.set_attn_processor(processor)
229
-
230
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
231
- batch_size, num_channels, height, width = x.shape
232
-
233
- if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size):
234
- return self._tiled_encode(x)
235
-
236
- enc = self.encoder(x)
237
- if self.quant_conv is not None:
238
- enc = self.quant_conv(enc)
239
-
240
- return enc
241
-
242
- @apply_forward_hook
243
- def encode(
244
- self, x: torch.Tensor, return_dict: bool = True
245
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
246
- """
247
- Encode a batch of images into latents.
248
-
249
- Args:
250
- x (`torch.Tensor`): Input batch of images.
251
- return_dict (`bool`, *optional*, defaults to `True`):
252
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
253
-
254
- Returns:
255
- The latent representations of the encoded images. If `return_dict` is True, a
256
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
257
- """
258
- if self.use_slicing and x.shape[0] > 1:
259
- encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
260
- h = torch.cat(encoded_slices)
261
- else:
262
- h = self._encode(x)
263
-
264
- posterior = DiagonalGaussianDistribution(h)
265
-
266
- if not return_dict:
267
- return (posterior,)
268
-
269
- return AutoencoderKLOutput(latent_dist=posterior)
270
-
271
- def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
272
- if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
273
- return self.tiled_decode(z, return_dict=return_dict)
274
-
275
- if self.post_quant_conv is not None:
276
- z = self.post_quant_conv(z)
277
-
278
- dec = self.decoder(z)
279
-
280
- if not return_dict:
281
- return (dec,)
282
-
283
- return DecoderOutput(sample=dec)
284
-
285
- @apply_forward_hook
286
- def decode(
287
- self, z: torch.FloatTensor, return_dict: bool = True, generator=None
288
- ) -> Union[DecoderOutput, torch.FloatTensor]:
289
- """
290
- Decode a batch of images.
291
-
292
- Args:
293
- z (`torch.Tensor`): Input batch of latent vectors.
294
- return_dict (`bool`, *optional*, defaults to `True`):
295
- Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
296
-
297
- Returns:
298
- [`~models.vae.DecoderOutput`] or `tuple`:
299
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
300
- returned.
301
-
302
- """
303
- if self.use_slicing and z.shape[0] > 1:
304
- decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
305
- decoded = torch.cat(decoded_slices)
306
- else:
307
- decoded = self._decode(z).sample
308
-
309
- if not return_dict:
310
- return (decoded,)
311
-
312
- return DecoderOutput(sample=decoded)
313
-
314
- def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
315
- blend_extent = min(a.shape[2], b.shape[2], blend_extent)
316
- for y in range(blend_extent):
317
- b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
318
- return b
319
-
320
- def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
321
- blend_extent = min(a.shape[3], b.shape[3], blend_extent)
322
- for x in range(blend_extent):
323
- b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
324
- return b
325
-
326
- def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
327
- r"""Encode a batch of images using a tiled encoder.
328
-
329
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
330
- steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
331
- different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
332
- tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
333
- output, but they should be much less noticeable.
334
-
335
- Args:
336
- x (`torch.Tensor`): Input batch of images.
337
-
338
- Returns:
339
- `torch.Tensor`:
340
- The latent representation of the encoded videos.
341
- """
342
-
343
- overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
344
- blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
345
- row_limit = self.tile_latent_min_size - blend_extent
346
-
347
- # Split the image into 512x512 tiles and encode them separately.
348
- rows = []
349
- for i in range(0, x.shape[2], overlap_size):
350
- row = []
351
- for j in range(0, x.shape[3], overlap_size):
352
- tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
353
- tile = self.encoder(tile)
354
- if self.config.use_quant_conv:
355
- tile = self.quant_conv(tile)
356
- row.append(tile)
357
- rows.append(row)
358
- result_rows = []
359
- for i, row in enumerate(rows):
360
- result_row = []
361
- for j, tile in enumerate(row):
362
- # blend the above tile and the left tile
363
- # to the current tile and add the current tile to the result row
364
- if i > 0:
365
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
366
- if j > 0:
367
- tile = self.blend_h(row[j - 1], tile, blend_extent)
368
- result_row.append(tile[:, :, :row_limit, :row_limit])
369
- result_rows.append(torch.cat(result_row, dim=3))
370
-
371
- enc = torch.cat(result_rows, dim=2)
372
- return enc
373
-
374
- def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
375
- r"""Encode a batch of images using a tiled encoder.
376
-
377
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
378
- steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
379
- different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
380
- tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
381
- output, but they should be much less noticeable.
382
-
383
- Args:
384
- x (`torch.Tensor`): Input batch of images.
385
- return_dict (`bool`, *optional*, defaults to `True`):
386
- Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
387
-
388
- Returns:
389
- [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
390
- If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
391
- `tuple` is returned.
392
- """
393
- deprecation_message = (
394
- "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the "
395
- "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able "
396
- "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value."
397
- )
398
- deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False)
399
-
400
- overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
401
- blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
402
- row_limit = self.tile_latent_min_size - blend_extent
403
-
404
- # Split the image into 512x512 tiles and encode them separately.
405
- rows = []
406
- for i in range(0, x.shape[2], overlap_size):
407
- row = []
408
- for j in range(0, x.shape[3], overlap_size):
409
- tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
410
- tile = self.encoder(tile)
411
- if self.config.use_quant_conv:
412
- tile = self.quant_conv(tile)
413
- row.append(tile)
414
- rows.append(row)
415
- result_rows = []
416
- for i, row in enumerate(rows):
417
- result_row = []
418
- for j, tile in enumerate(row):
419
- # blend the above tile and the left tile
420
- # to the current tile and add the current tile to the result row
421
- if i > 0:
422
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
423
- if j > 0:
424
- tile = self.blend_h(row[j - 1], tile, blend_extent)
425
- result_row.append(tile[:, :, :row_limit, :row_limit])
426
- result_rows.append(torch.cat(result_row, dim=3))
427
-
428
- moments = torch.cat(result_rows, dim=2)
429
- posterior = DiagonalGaussianDistribution(moments)
430
-
431
- if not return_dict:
432
- return (posterior,)
433
-
434
- return AutoencoderKLOutput(latent_dist=posterior)
435
-
436
- def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
437
- r"""
438
- Decode a batch of images using a tiled decoder.
439
-
440
- Args:
441
- z (`torch.Tensor`): Input batch of latent vectors.
442
- return_dict (`bool`, *optional*, defaults to `True`):
443
- Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
444
-
445
- Returns:
446
- [`~models.vae.DecoderOutput`] or `tuple`:
447
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
448
- returned.
449
- """
450
- overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
451
- blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
452
- row_limit = self.tile_sample_min_size - blend_extent
453
-
454
- # Split z into overlapping 64x64 tiles and decode them separately.
455
- # The tiles have an overlap to avoid seams between tiles.
456
- rows = []
457
- for i in range(0, z.shape[2], overlap_size):
458
- row = []
459
- for j in range(0, z.shape[3], overlap_size):
460
- tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
461
- if self.config.use_post_quant_conv:
462
- tile = self.post_quant_conv(tile)
463
- decoded = self.decoder(tile)
464
- row.append(decoded)
465
- rows.append(row)
466
- result_rows = []
467
- for i, row in enumerate(rows):
468
- result_row = []
469
- for j, tile in enumerate(row):
470
- # blend the above tile and the left tile
471
- # to the current tile and add the current tile to the result row
472
- if i > 0:
473
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
474
- if j > 0:
475
- tile = self.blend_h(row[j - 1], tile, blend_extent)
476
- result_row.append(tile[:, :, :row_limit, :row_limit])
477
- result_rows.append(torch.cat(result_row, dim=3))
478
-
479
- dec = torch.cat(result_rows, dim=2)
480
- if not return_dict:
481
- return (dec,)
482
-
483
- return DecoderOutput(sample=dec)
484
-
485
- def forward(
486
- self,
487
- sample: torch.Tensor,
488
- sample_posterior: bool = False,
489
- return_dict: bool = True,
490
- generator: Optional[torch.Generator] = None,
491
- ) -> Union[DecoderOutput, torch.Tensor]:
492
- r"""
493
- Args:
494
- sample (`torch.Tensor`): Input sample.
495
- sample_posterior (`bool`, *optional*, defaults to `False`):
496
- Whether to sample from the posterior.
497
- return_dict (`bool`, *optional*, defaults to `True`):
498
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
499
- """
500
- x = sample
501
- posterior = self.encode(x).latent_dist
502
- if sample_posterior:
503
- z = posterior.sample(generator=generator)
504
- else:
505
- z = posterior.mode()
506
- dec = self.decode(z).sample
507
-
508
- if not return_dict:
509
- return (dec,)
510
-
511
- return DecoderOutput(sample=dec)
512
-
513
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
514
- def fuse_qkv_projections(self):
515
- """
516
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
517
- are fused. For cross-attention modules, key and value projection matrices are fused.
518
-
519
- > [!WARNING] > This API is 🧪 experimental.
520
- """
521
- self.original_attn_processors = None
522
-
523
- for _, attn_processor in self.attn_processors.items():
524
- if "Added" in str(attn_processor.__class__.__name__):
525
- raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
526
-
527
- self.original_attn_processors = self.attn_processors
528
-
529
- for module in self.modules():
530
- if isinstance(module, Attention):
531
- module.fuse_projections(fuse=True)
532
-
533
- self.set_attn_processor(FusedAttnProcessor2_0())
534
-
535
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
536
- def unfuse_qkv_projections(self):
537
- """Disables the fused QKV projection if enabled.
538
-
539
- > [!WARNING] > This API is 🧪 experimental.
540
-
541
- """
542
- if self.original_attn_processors is not None:
543
- self.set_attn_processor(self.original_attn_processors)
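
The tiled encode/decode paths above avoid visible seams by linearly cross-fading the overlapping border between neighbouring tiles (`blend_v` for rows, `blend_h` for columns). A standalone sketch of the horizontal blend on toy tensors (shapes chosen only for illustration):

```python
import torch

def blend_h(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    # `a` is the tile to the left, `b` the current tile; both are [B, C, H, W].
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        # Weight ramps linearly from the left tile towards the current tile across the overlap.
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

left = torch.zeros(1, 3, 8, 8)   # pretend decoded left tile
right = torch.ones(1, 3, 8, 8)   # pretend decoded current tile
blended = blend_h(left, right, blend_extent=4)
print(blended[0, 0, 0, :4])      # the first 4 columns ramp from 0.00 towards 1.00
```
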
videox_fun/models/flux_transformer2d.py DELETED
@@ -1,832 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_flux.py
2
- # Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import inspect
17
- from typing import Any, Dict, List, Optional, Tuple, Union
18
-
19
- import numpy as np
20
- import torch
21
- import torch.nn as nn
22
- import torch.nn.functional as F
23
- from diffusers.configuration_utils import ConfigMixin, register_to_config
24
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
26
- from diffusers.models.attention import FeedForward
27
- from diffusers.models.attention_processor import AttentionProcessor
28
- from diffusers.models.embeddings import (
29
- CombinedTimestepGuidanceTextProjEmbeddings,
30
- CombinedTimestepTextProjEmbeddings, get_1d_rotary_pos_embed)
31
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
32
- from diffusers.models.modeling_utils import ModelMixin
33
- from diffusers.models.normalization import (AdaLayerNormContinuous,
34
- AdaLayerNormZero,
35
- AdaLayerNormZeroSingle)
36
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
37
- scale_lora_layers, unscale_lora_layers)
38
- from diffusers.utils.torch_utils import maybe_allow_in_graph
39
-
40
- from ..dist import (FluxMultiGPUsAttnProcessor2_0, get_sequence_parallel_rank,
41
- get_sequence_parallel_world_size, get_sp_group)
42
- from .attention_utils import attention
43
-
44
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
45
-
46
- def _get_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
47
- query = attn.to_q(hidden_states)
48
- key = attn.to_k(hidden_states)
49
- value = attn.to_v(hidden_states)
50
-
51
- encoder_query = encoder_key = encoder_value = None
52
- if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
53
- encoder_query = attn.add_q_proj(encoder_hidden_states)
54
- encoder_key = attn.add_k_proj(encoder_hidden_states)
55
- encoder_value = attn.add_v_proj(encoder_hidden_states)
56
-
57
- return query, key, value, encoder_query, encoder_key, encoder_value
58
-
59
- def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_states=None):
60
- return _get_projections(attn, hidden_states, encoder_hidden_states)
61
-
62
- def apply_rotary_emb(
63
- x: torch.Tensor,
64
- freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
65
- use_real: bool = True,
66
- use_real_unbind_dim: int = -1,
67
- sequence_dim: int = 2,
68
- ) -> Tuple[torch.Tensor, torch.Tensor]:
69
- """
70
- Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
71
- to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
72
- reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
73
- tensors contain rotary embeddings and are returned as real tensors.
74
-
75
- Args:
76
- x (`torch.Tensor`):
77
- Query or key tensor to apply rotary embeddings, of shape [B, H, S, D] (or [B, S, H, D] when `sequence_dim=1`).
78
- freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
79
-
80
- Returns:
81
- Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
82
- """
83
- if use_real:
84
- cos, sin = freqs_cis # [S, D]
85
- if sequence_dim == 2:
86
- cos = cos[None, None, :, :]
87
- sin = sin[None, None, :, :]
88
- elif sequence_dim == 1:
89
- cos = cos[None, :, None, :]
90
- sin = sin[None, :, None, :]
91
- else:
92
- raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
93
-
94
- cos, sin = cos.to(x.device), sin.to(x.device)
95
-
96
- if use_real_unbind_dim == -1:
97
- # Used for flux, cogvideox, hunyuan-dit
98
- x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, H, S, D//2]
99
- x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
100
- elif use_real_unbind_dim == -2:
101
- # Used for Stable Audio, OmniGen, CogView4 and Cosmos
102
- x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, H, S, D//2]
103
- x_rotated = torch.cat([-x_imag, x_real], dim=-1)
104
- else:
105
- raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
106
-
107
- out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
108
-
109
- return out
110
- else:
111
- # used for lumina
112
- x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
113
- freqs_cis = freqs_cis.unsqueeze(2)
114
- x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
115
-
116
- return x_out.type_as(x)
117
-
118
-
119
- class FluxAttnProcessor:
120
- _attention_backend = None
121
-
122
- def __init__(self):
123
- if not hasattr(F, "scaled_dot_product_attention"):
124
- raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")
125
-
126
- def __call__(
127
- self,
128
- attn: "FluxAttention",
129
- hidden_states: torch.Tensor,
130
- encoder_hidden_states: torch.Tensor = None,
131
- attention_mask: Optional[torch.Tensor] = None,
132
- image_rotary_emb: Optional[torch.Tensor] = None,
133
- text_seq_len: int = None,
134
- ) -> torch.Tensor:
135
- query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
136
- attn, hidden_states, encoder_hidden_states
137
- )
138
-
139
- query = query.unflatten(-1, (attn.heads, -1))
140
- key = key.unflatten(-1, (attn.heads, -1))
141
- value = value.unflatten(-1, (attn.heads, -1))
142
-
143
- query = attn.norm_q(query)
144
- key = attn.norm_k(key)
145
-
146
- if attn.added_kv_proj_dim is not None:
147
- encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
148
- encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
149
- encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
150
-
151
- encoder_query = attn.norm_added_q(encoder_query)
152
- encoder_key = attn.norm_added_k(encoder_key)
153
-
154
- query = torch.cat([encoder_query, query], dim=1)
155
- key = torch.cat([encoder_key, key], dim=1)
156
- value = torch.cat([encoder_value, value], dim=1)
157
-
158
- if image_rotary_emb is not None:
159
- query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
160
- key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
161
-
162
- hidden_states = attention(
163
- query, key, value, attn_mask=attention_mask,
164
- )
165
- hidden_states = hidden_states.flatten(2, 3)
166
- hidden_states = hidden_states.to(query.dtype)
167
-
168
- if encoder_hidden_states is not None:
169
- encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
170
- [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
171
- )
172
- hidden_states = attn.to_out[0](hidden_states)
173
- hidden_states = attn.to_out[1](hidden_states)
174
- encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
175
-
176
- return hidden_states, encoder_hidden_states
177
- else:
178
- return hidden_states
179
-
180
-
181
- class FluxAttention(torch.nn.Module):
182
- _default_processor_cls = FluxAttnProcessor
183
- _available_processors = [
184
- FluxAttnProcessor,
185
- ]
186
-
187
- def __init__(
188
- self,
189
- query_dim: int,
190
- heads: int = 8,
191
- dim_head: int = 64,
192
- dropout: float = 0.0,
193
- bias: bool = False,
194
- added_kv_proj_dim: Optional[int] = None,
195
- added_proj_bias: Optional[bool] = True,
196
- out_bias: bool = True,
197
- eps: float = 1e-5,
198
- out_dim: int = None,
199
- context_pre_only: Optional[bool] = None,
200
- pre_only: bool = False,
201
- elementwise_affine: bool = True,
202
- processor=None,
203
- ):
204
- super().__init__()
205
-
206
- self.head_dim = dim_head
207
- self.inner_dim = out_dim if out_dim is not None else dim_head * heads
208
- self.query_dim = query_dim
209
- self.use_bias = bias
210
- self.dropout = dropout
211
- self.out_dim = out_dim if out_dim is not None else query_dim
212
- self.context_pre_only = context_pre_only
213
- self.pre_only = pre_only
214
- self.heads = out_dim // dim_head if out_dim is not None else heads
215
- self.added_kv_proj_dim = added_kv_proj_dim
216
- self.added_proj_bias = added_proj_bias
217
-
218
- self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
219
- self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
220
- self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
221
- self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
222
- self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
223
-
224
- if not self.pre_only:
225
- self.to_out = torch.nn.ModuleList([])
226
- self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
227
- self.to_out.append(torch.nn.Dropout(dropout))
228
-
229
- if added_kv_proj_dim is not None:
230
- self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
231
- self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
232
- self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
233
- self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
234
- self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
235
- self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
236
-
237
- if processor is None:
238
- self.processor = self._default_processor_cls()
239
- else:
240
- self.processor = processor
241
-
242
- def set_processor(self, processor: "AttnProcessor") -> None:
243
- r"""
244
- Set the attention processor to use.
245
-
246
- Args:
247
- processor (`AttnProcessor`):
248
- The attention processor to use.
249
- """
250
- # if current processor is in `self._modules` and if passed `processor` is not, we need to
251
- # pop `processor` from `self._modules`
252
- if (
253
- hasattr(self, "processor")
254
- and isinstance(self.processor, torch.nn.Module)
255
- and not isinstance(processor, torch.nn.Module)
256
- ):
257
- logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
258
- self._modules.pop("processor")
259
-
260
- self.processor = processor
261
-
262
- def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
263
- r"""
264
- Get the attention processor in use.
265
-
266
- Args:
267
- return_deprecated_lora (`bool`, *optional*, defaults to `False`):
268
- Set to `True` to return the deprecated LoRA attention processor.
269
-
270
- Returns:
271
- "AttentionProcessor": The attention processor in use.
272
- """
273
- if not return_deprecated_lora:
274
- return self.processor
275
-
276
- def forward(
277
- self,
278
- hidden_states: torch.Tensor,
279
- encoder_hidden_states: Optional[torch.Tensor] = None,
280
- attention_mask: Optional[torch.Tensor] = None,
281
- image_rotary_emb: Optional[torch.Tensor] = None,
282
- **kwargs,
283
- ) -> torch.Tensor:
284
- attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
285
- quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"}
286
- unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters]
287
- if len(unused_kwargs) > 0:
288
- logger.warning(
289
- f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
290
- )
291
- kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
292
- return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
293
-
294
-
295
- @maybe_allow_in_graph
296
- class FluxSingleTransformerBlock(nn.Module):
297
- def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
298
- super().__init__()
299
- self.mlp_hidden_dim = int(dim * mlp_ratio)
300
-
301
- self.norm = AdaLayerNormZeroSingle(dim)
302
- self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
303
- self.act_mlp = nn.GELU(approximate="tanh")
304
- self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
305
-
306
- self.attn = FluxAttention(
307
- query_dim=dim,
308
- dim_head=attention_head_dim,
309
- heads=num_attention_heads,
310
- out_dim=dim,
311
- bias=True,
312
- processor=FluxAttnProcessor(),
313
- eps=1e-6,
314
- pre_only=True,
315
- )
316
-
317
- def forward(
318
- self,
319
- hidden_states: torch.Tensor,
320
- encoder_hidden_states: torch.Tensor,
321
- temb: torch.Tensor,
322
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
323
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
324
- ) -> Tuple[torch.Tensor, torch.Tensor]:
325
- text_seq_len = encoder_hidden_states.shape[1]
326
- hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
327
-
328
- residual = hidden_states
329
- norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
330
- mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
331
- joint_attention_kwargs = joint_attention_kwargs or {}
332
- attn_output = self.attn(
333
- hidden_states=norm_hidden_states,
334
- image_rotary_emb=image_rotary_emb,
335
- text_seq_len=text_seq_len,
336
- **joint_attention_kwargs,
337
- )
338
- hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
339
- gate = gate.unsqueeze(1)
340
- hidden_states = gate * self.proj_out(hidden_states)
341
- hidden_states = residual + hidden_states
342
- if hidden_states.dtype == torch.float16:
343
- hidden_states = hidden_states.clip(-65504, 65504)
344
-
345
- encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
346
- return encoder_hidden_states, hidden_states
347
-
348
-
349
- @maybe_allow_in_graph
350
- class FluxTransformerBlock(nn.Module):
351
- def __init__(
352
- self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
353
- ):
354
- super().__init__()
355
-
356
- self.norm1 = AdaLayerNormZero(dim)
357
- self.norm1_context = AdaLayerNormZero(dim)
358
-
359
- self.attn = FluxAttention(
360
- query_dim=dim,
361
- added_kv_proj_dim=dim,
362
- dim_head=attention_head_dim,
363
- heads=num_attention_heads,
364
- out_dim=dim,
365
- context_pre_only=False,
366
- bias=True,
367
- processor=FluxAttnProcessor(),
368
- eps=eps,
369
- )
370
-
371
- self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
372
- self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
373
-
374
- self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
375
- self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
376
-
377
- def forward(
378
- self,
379
- hidden_states: torch.Tensor,
380
- encoder_hidden_states: torch.Tensor,
381
- temb: torch.Tensor,
382
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
383
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
384
- ) -> Tuple[torch.Tensor, torch.Tensor]:
385
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
386
-
387
- norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
388
- encoder_hidden_states, emb=temb
389
- )
390
- joint_attention_kwargs = joint_attention_kwargs or {}
391
-
392
- # Attention.
393
- attention_outputs = self.attn(
394
- hidden_states=norm_hidden_states,
395
- encoder_hidden_states=norm_encoder_hidden_states,
396
- image_rotary_emb=image_rotary_emb,
397
- **joint_attention_kwargs,
398
- )
399
-
400
- if len(attention_outputs) == 2:
401
- attn_output, context_attn_output = attention_outputs
402
- elif len(attention_outputs) == 3:
403
- attn_output, context_attn_output, ip_attn_output = attention_outputs
404
-
405
- # Process attention outputs for the `hidden_states`.
406
- attn_output = gate_msa.unsqueeze(1) * attn_output
407
- hidden_states = hidden_states + attn_output
408
-
409
- norm_hidden_states = self.norm2(hidden_states)
410
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
411
-
412
- ff_output = self.ff(norm_hidden_states)
413
- ff_output = gate_mlp.unsqueeze(1) * ff_output
414
-
415
- hidden_states = hidden_states + ff_output
416
- if len(attention_outputs) == 3:
417
- hidden_states = hidden_states + ip_attn_output
418
-
419
- # Process attention outputs for the `encoder_hidden_states`.
420
- context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
421
- encoder_hidden_states = encoder_hidden_states + context_attn_output
422
-
423
- norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
424
- norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
425
-
426
- context_ff_output = self.ff_context(norm_encoder_hidden_states)
427
- encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
428
- if encoder_hidden_states.dtype == torch.float16:
429
- encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
430
-
431
- return encoder_hidden_states, hidden_states
432
-
433
-
434
- class FluxPosEmbed(nn.Module):
435
- # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
436
- def __init__(self, theta: int, axes_dim: List[int]):
437
- super().__init__()
438
- self.theta = theta
439
- self.axes_dim = axes_dim
440
-
441
- def forward(self, ids: torch.Tensor) -> torch.Tensor:
442
- n_axes = ids.shape[-1]
443
- cos_out = []
444
- sin_out = []
445
- pos = ids.float()
446
- is_mps = ids.device.type == "mps"
447
- is_npu = ids.device.type == "npu"
448
- freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
449
- for i in range(n_axes):
450
- cos, sin = get_1d_rotary_pos_embed(
451
- self.axes_dim[i],
452
- pos[:, i],
453
- theta=self.theta,
454
- repeat_interleave_real=True,
455
- use_real=True,
456
- freqs_dtype=freqs_dtype,
457
- )
458
- cos_out.append(cos)
459
- sin_out.append(sin)
460
- freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
461
- freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
462
- return freqs_cos, freqs_sin
463
-
464
-
465
- class FluxTransformer2DModel(
466
- ModelMixin,
467
- ConfigMixin,
468
- PeftAdapterMixin,
469
- FromOriginalModelMixin,
470
- ):
471
- """
472
- The Transformer model introduced in Flux.
473
-
474
- Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
475
-
476
- Args:
477
- patch_size (`int`, defaults to `1`):
478
- Patch size to turn the input data into small patches.
479
- in_channels (`int`, defaults to `64`):
480
- The number of channels in the input.
481
- out_channels (`int`, *optional*, defaults to `None`):
482
- The number of channels in the output. If not specified, it defaults to `in_channels`.
483
- num_layers (`int`, defaults to `19`):
484
- The number of layers of dual stream DiT blocks to use.
485
- num_single_layers (`int`, defaults to `38`):
486
- The number of layers of single stream DiT blocks to use.
487
- attention_head_dim (`int`, defaults to `128`):
488
- The number of dimensions to use for each attention head.
489
- num_attention_heads (`int`, defaults to `24`):
490
- The number of attention heads to use.
491
- joint_attention_dim (`int`, defaults to `4096`):
492
- The number of dimensions to use for the joint attention (embedding/channel dimension of
493
- `encoder_hidden_states`).
494
- pooled_projection_dim (`int`, defaults to `768`):
495
- The number of dimensions to use for the pooled projection.
496
- guidance_embeds (`bool`, defaults to `False`):
497
- Whether to use guidance embeddings for guidance-distilled variant of the model.
498
- axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
499
- The dimensions to use for the rotary positional embeddings.
500
- """
501
-
502
- _supports_gradient_checkpointing = True
503
- # _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
504
- # _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
505
- # _repeated_blocks = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
506
-
507
- @register_to_config
508
- def __init__(
509
- self,
510
- patch_size: int = 1,
511
- in_channels: int = 64,
512
- out_channels: Optional[int] = None,
513
- num_layers: int = 19,
514
- num_single_layers: int = 38,
515
- attention_head_dim: int = 128,
516
- num_attention_heads: int = 24,
517
- joint_attention_dim: int = 4096,
518
- pooled_projection_dim: int = 768,
519
- guidance_embeds: bool = False,
520
- axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
521
- ):
522
- super().__init__()
523
- self.out_channels = out_channels or in_channels
524
- self.inner_dim = num_attention_heads * attention_head_dim
525
-
526
- self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
527
-
528
- text_time_guidance_cls = (
529
- CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
530
- )
531
- self.time_text_embed = text_time_guidance_cls(
532
- embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
533
- )
534
-
535
- self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
536
- self.x_embedder = nn.Linear(in_channels, self.inner_dim)
537
-
538
- self.transformer_blocks = nn.ModuleList(
539
- [
540
- FluxTransformerBlock(
541
- dim=self.inner_dim,
542
- num_attention_heads=num_attention_heads,
543
- attention_head_dim=attention_head_dim,
544
- )
545
- for _ in range(num_layers)
546
- ]
547
- )
548
-
549
- self.single_transformer_blocks = nn.ModuleList(
550
- [
551
- FluxSingleTransformerBlock(
552
- dim=self.inner_dim,
553
- num_attention_heads=num_attention_heads,
554
- attention_head_dim=attention_head_dim,
555
- )
556
- for _ in range(num_single_layers)
557
- ]
558
- )
559
-
560
- self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
561
- self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
562
-
563
- self.gradient_checkpointing = False
564
-
565
- self.sp_world_size = 1
566
- self.sp_world_rank = 0
567
-
568
- def _set_gradient_checkpointing(self, *args, **kwargs):
569
- if "value" in kwargs:
570
- self.gradient_checkpointing = kwargs["value"]
571
- elif "enable" in kwargs:
572
- self.gradient_checkpointing = kwargs["enable"]
573
- else:
574
- raise ValueError("Invalid set gradient checkpointing")
575
-
576
- def enable_multi_gpus_inference(self,):
577
- self.sp_world_size = get_sequence_parallel_world_size()
578
- self.sp_world_rank = get_sequence_parallel_rank()
579
- self.all_gather = get_sp_group().all_gather
580
- self.set_attn_processor(FluxMultiGPUsAttnProcessor2_0())
581
-
582
- @property
583
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
584
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
585
- r"""
586
- Returns:
587
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
588
- indexed by its weight name.
589
- """
590
- # set recursively
591
- processors = {}
592
-
593
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
594
- if hasattr(module, "get_processor"):
595
- processors[f"{name}.processor"] = module.get_processor()
596
-
597
- for sub_name, child in module.named_children():
598
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
599
-
600
- return processors
601
-
602
- for name, module in self.named_children():
603
- fn_recursive_add_processors(name, module, processors)
604
-
605
- return processors
606
-
607
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
608
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
609
- r"""
610
- Sets the attention processor to use to compute attention.
611
-
612
- Parameters:
613
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
614
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
615
- for **all** `Attention` layers.
616
-
617
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
618
- processor. This is strongly recommended when setting trainable attention processors.
619
-
620
- """
621
- count = len(self.attn_processors.keys())
622
-
623
- if isinstance(processor, dict) and len(processor) != count:
624
- raise ValueError(
625
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
626
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
627
- )
628
-
629
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
630
- if hasattr(module, "set_processor"):
631
- if not isinstance(processor, dict):
632
- module.set_processor(processor)
633
- else:
634
- module.set_processor(processor.pop(f"{name}.processor"))
635
-
636
- for sub_name, child in module.named_children():
637
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
638
-
639
- for name, module in self.named_children():
640
- fn_recursive_attn_processor(name, module, processor)
641
-
642
- def forward(
643
- self,
644
- hidden_states: torch.Tensor,
645
- encoder_hidden_states: torch.Tensor = None,
646
- pooled_projections: torch.Tensor = None,
647
- timestep: torch.LongTensor = None,
648
- img_ids: torch.Tensor = None,
649
- txt_ids: torch.Tensor = None,
650
- guidance: torch.Tensor = None,
651
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
652
- controlnet_block_samples=None,
653
- controlnet_single_block_samples=None,
654
- return_dict: bool = True,
655
- controlnet_blocks_repeat: bool = False,
656
- ) -> Union[torch.Tensor, Transformer2DModelOutput]:
657
- """
658
- The [`FluxTransformer2DModel`] forward method.
659
-
660
- Args:
661
- hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
662
- Input `hidden_states`.
663
- encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
664
- Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
665
- pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
666
- from the embeddings of input conditions.
667
- timestep ( `torch.LongTensor`):
668
- Used to indicate denoising step.
669
- block_controlnet_hidden_states: (`list` of `torch.Tensor`):
670
- A list of tensors that if specified are added to the residuals of transformer blocks.
671
- joint_attention_kwargs (`dict`, *optional*):
672
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
673
- `self.processor` in
674
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
675
- return_dict (`bool`, *optional*, defaults to `True`):
676
- Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
677
- tuple.
678
-
679
- Returns:
680
- If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
681
- `tuple` where the first element is the sample tensor.
682
- """
683
- if joint_attention_kwargs is not None:
684
- joint_attention_kwargs = joint_attention_kwargs.copy()
685
- lora_scale = joint_attention_kwargs.pop("scale", 1.0)
686
- else:
687
- lora_scale = 1.0
688
-
689
- if USE_PEFT_BACKEND:
690
- # weight the lora layers by setting `lora_scale` for each PEFT layer
691
- scale_lora_layers(self, lora_scale)
692
- else:
693
- if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
694
- logger.warning(
695
- "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
696
- )
697
-
698
- hidden_states = self.x_embedder(hidden_states)
699
-
700
- timestep = timestep.to(hidden_states.dtype) * 1000
701
- if guidance is not None:
702
- guidance = guidance.to(hidden_states.dtype) * 1000
703
-
704
- temb = (
705
- self.time_text_embed(timestep, pooled_projections)
706
- if guidance is None
707
- else self.time_text_embed(timestep, guidance, pooled_projections)
708
- )
709
- encoder_hidden_states = self.context_embedder(encoder_hidden_states)
710
-
711
- if txt_ids.ndim == 3:
712
- logger.warning(
713
- "Passing `txt_ids` 3d torch.Tensor is deprecated."
714
- "Please remove the batch dimension and pass it as a 2d torch Tensor"
715
- )
716
- txt_ids = txt_ids[0]
717
- if img_ids.ndim == 3:
718
- logger.warning(
719
- "Passing `img_ids` 3d torch.Tensor is deprecated."
720
- "Please remove the batch dimension and pass it as a 2d torch Tensor"
721
- )
722
- img_ids = img_ids[0]
723
-
724
- ids = torch.cat((txt_ids, img_ids), dim=0)
725
- image_rotary_emb = self.pos_embed(ids)
726
-
727
- if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
728
- ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
729
- ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
730
- joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
731
-
732
- # Context Parallel
733
- if self.sp_world_size > 1:
734
- hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
735
- if image_rotary_emb is not None:
736
- txt_rotary_emb = (
737
- image_rotary_emb[0][:encoder_hidden_states.shape[1]],
738
- image_rotary_emb[1][:encoder_hidden_states.shape[1]]
739
- )
740
- image_rotary_emb = (
741
- torch.chunk(image_rotary_emb[0][encoder_hidden_states.shape[1]:], self.sp_world_size, dim=0)[self.sp_world_rank],
742
- torch.chunk(image_rotary_emb[1][encoder_hidden_states.shape[1]:], self.sp_world_size, dim=0)[self.sp_world_rank],
743
- )
744
- image_rotary_emb = [torch.cat([_txt_rotary_emb, _image_rotary_emb], dim=0) \
745
- for _txt_rotary_emb, _image_rotary_emb in zip(txt_rotary_emb, image_rotary_emb)]
746
-
747
- for index_block, block in enumerate(self.transformer_blocks):
748
- if torch.is_grad_enabled() and self.gradient_checkpointing:
749
- def create_custom_forward(module):
750
- def custom_forward(*inputs):
751
- return module(*inputs)
752
-
753
- return custom_forward
754
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
755
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
756
- create_custom_forward(block),
757
- hidden_states,
758
- encoder_hidden_states,
759
- temb,
760
- image_rotary_emb,
761
- joint_attention_kwargs,
762
- **ckpt_kwargs,
763
- )
764
-
765
- else:
766
- encoder_hidden_states, hidden_states = block(
767
- hidden_states=hidden_states,
768
- encoder_hidden_states=encoder_hidden_states,
769
- temb=temb,
770
- image_rotary_emb=image_rotary_emb,
771
- joint_attention_kwargs=joint_attention_kwargs,
772
- )
773
-
774
- # controlnet residual
775
- if controlnet_block_samples is not None:
776
- interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
777
- interval_control = int(np.ceil(interval_control))
778
- # For Xlabs ControlNet.
779
- if controlnet_blocks_repeat:
780
- hidden_states = (
781
- hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
782
- )
783
- else:
784
- hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
785
-
786
- for index_block, block in enumerate(self.single_transformer_blocks):
787
- if torch.is_grad_enabled() and self.gradient_checkpointing:
788
- def create_custom_forward(module):
789
- def custom_forward(*inputs):
790
- return module(*inputs)
791
-
792
- return custom_forward
793
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
794
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
795
- create_custom_forward(block),
796
- hidden_states,
797
- encoder_hidden_states,
798
- temb,
799
- image_rotary_emb,
800
- joint_attention_kwargs,
801
- **ckpt_kwargs,
802
- )
803
-
804
- else:
805
- encoder_hidden_states, hidden_states = block(
806
- hidden_states=hidden_states,
807
- encoder_hidden_states=encoder_hidden_states,
808
- temb=temb,
809
- image_rotary_emb=image_rotary_emb,
810
- joint_attention_kwargs=joint_attention_kwargs,
811
- )
812
-
813
- # controlnet residual
814
- if controlnet_single_block_samples is not None:
815
- interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
816
- interval_control = int(np.ceil(interval_control))
817
- hidden_states = hidden_states + controlnet_single_block_samples[index_block // interval_control]
818
-
819
- hidden_states = self.norm_out(hidden_states, temb)
820
- output = self.proj_out(hidden_states)
821
-
822
- if self.sp_world_size > 1:
823
- output = self.all_gather(output, dim=1)
824
-
825
- if USE_PEFT_BACKEND:
826
- # remove `lora_scale` from each PEFT layer
827
- unscale_lora_layers(self, lora_scale)
828
-
829
- if not return_dict:
830
- return (output,)
831
-
832
- return Transformer2DModelOutput(sample=output)
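The forward pass above shards the image-token sequence across sequence-parallel ranks before the transformer blocks (the text portion of the rotary embedding is kept whole while only the image portion is chunked) and re-assembles the result with an all-gather on the output. A minimal single-process sketch of that bookkeeping, assuming illustrative shapes and an identity stand-in for the transformer blocks:

import torch

sp_world_size = 4                                   # number of sequence-parallel ranks (assumed)
text_len, image_len, dim = 64, 256, 32              # illustrative sizes

hidden_states = torch.randn(1, image_len, dim)      # image tokens only
rope_cos = torch.randn(text_len + image_len, dim)   # text rotary rows first, then image rows

outputs = []
for rank in range(sp_world_size):
    # Each rank keeps its slice of the image tokens ...
    local_hidden = torch.chunk(hidden_states, sp_world_size, dim=1)[rank]
    # ... plus the full text rotary table and its slice of the image rotary table.
    local_img_cos = torch.chunk(rope_cos[text_len:], sp_world_size, dim=0)[rank]
    local_rope_cos = torch.cat([rope_cos[:text_len], local_img_cos], dim=0)
    # The transformer blocks would run here on (local_hidden, local_rope_cos);
    # an identity stand-in keeps the sketch self-contained.
    outputs.append(local_hidden)

# self.all_gather(output, dim=1) in the model; a plain concat reproduces it on one process.
gathered = torch.cat(outputs, dim=1)
assert torch.equal(gathered, hidden_states)

The same pattern appears again in the HunyuanVideo transformer below, where only the latent tokens are chunked and the final projection is all-gathered before unpatchifying.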
videox_fun/models/hunyuanvideo_transformer3d.py DELETED
@@ -1,1478 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
2
- # Copyright 2025 The Hunyuan Team and The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import glob
17
- import json
18
- import os
19
- from typing import Any, Dict, List, Optional, Tuple, Union
20
-
21
- import torch
22
- import torch.nn as nn
23
- import torch.nn.functional as F
24
- from diffusers.configuration_utils import ConfigMixin, register_to_config
25
- from diffusers.loaders import FromOriginalModelMixin
26
- from diffusers.models.attention import FeedForward
27
- from diffusers.models.attention_processor import Attention, AttentionProcessor
28
- from diffusers.models.embeddings import (CombinedTimestepTextProjEmbeddings,
29
- PixArtAlphaTextProjection,
30
- TimestepEmbedding, Timesteps,
31
- get_1d_rotary_pos_embed)
32
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
33
- from diffusers.models.modeling_utils import ModelMixin
34
- from diffusers.models.normalization import (AdaLayerNormContinuous,
35
- AdaLayerNormZero,
36
- AdaLayerNormZeroSingle,
37
- FP32LayerNorm)
38
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
39
- scale_lora_layers, unscale_lora_layers)
40
-
41
- from ..dist import (get_sequence_parallel_rank,
42
- get_sequence_parallel_world_size, get_sp_group,
43
- xFuserLongContextAttention)
44
- from ..dist.hunyuanvideo_xfuser import HunyuanVideoMultiGPUsAttnProcessor2_0
45
- from .attention_utils import attention
46
-
47
-
48
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
-
50
-
51
- def apply_rotary_emb(
52
- x: torch.Tensor,
53
- freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
54
- use_real: bool = True,
55
- use_real_unbind_dim: int = -1,
56
- sequence_dim: int = 2,
57
- ) -> torch.Tensor:
58
- """
59
- Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
60
- to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
61
- reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
62
- tensors contain rotary embeddings and are returned as real tensors.
63
-
64
- Args:
65
- x (`torch.Tensor`):
66
- Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
67
- freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
68
-
69
- Returns:
70
- torch.Tensor: The input tensor with rotary embeddings applied.
71
- """
72
- if use_real:
73
- cos, sin = freqs_cis # [S, D]
74
- if sequence_dim == 2:
75
- cos = cos[None, None, :, :]
76
- sin = sin[None, None, :, :]
77
- elif sequence_dim == 1:
78
- cos = cos[None, :, None, :]
79
- sin = sin[None, :, None, :]
80
- else:
81
- raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
82
-
83
- cos, sin = cos.to(x.device), sin.to(x.device)
84
-
85
- if use_real_unbind_dim == -1:
86
- # Used for flux, cogvideox, hunyuan-dit
87
- x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, H, S, D//2]
88
- x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
89
- elif use_real_unbind_dim == -2:
90
- # Used for Stable Audio, OmniGen, CogView4 and Cosmos
91
- x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, H, S, D//2]
92
- x_rotated = torch.cat([-x_imag, x_real], dim=-1)
93
- else:
94
- raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
95
-
96
- out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
97
-
98
- return out
99
- else:
100
- # used for lumina
101
- x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
102
- freqs_cis = freqs_cis.unsqueeze(2)
103
- x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
104
-
105
- return x_out.type_as(x)
106
-
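The use_real=True branch of apply_rotary_emb above rotates interleaved (real, imaginary) channel pairs with elementwise cos/sin tables. As a quick sanity check that this matches the complex-multiplication view described in the docstring, a small sketch with assumed shapes and hand-built tables (not part of the repository):

import torch

B, H, S, D = 1, 2, 4, 8                                       # illustrative shapes
x = torch.randn(B, H, S, D)
theta = torch.randn(S, D // 2)                                # one angle per channel pair
cos = torch.repeat_interleave(torch.cos(theta), 2, dim=-1)    # [S, D], duplicated per pair
sin = torch.repeat_interleave(torch.sin(theta), 2, dim=-1)

# Real-valued path, as in the use_real_unbind_dim=-1 branch: x * cos + rotate_half(x) * sin
x_real, x_imag = x.reshape(B, H, S, -1, 2).unbind(-1)
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
out_real = x * cos + x_rotated * sin

# Complex view: treat each channel pair as a complex number and multiply by exp(i * theta).
x_complex = torch.view_as_complex(x.reshape(B, H, S, -1, 2).contiguous())
freqs = torch.polar(torch.ones_like(theta), theta)            # cos(theta) + i * sin(theta)
out_complex = torch.view_as_real(x_complex * freqs).flatten(3)

assert torch.allclose(out_real, out_complex, atol=1e-5)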
107
- def extract_seqlens_from_mask(attn_mask):
108
- if attn_mask is None:
109
- return None
110
-
111
- if len(attn_mask.shape) == 4:
112
- bs, _, _, seq_len = attn_mask.shape
113
-
114
- if attn_mask.dtype == torch.bool:
115
- valid_mask = attn_mask.squeeze(1).squeeze(1)
116
- else:
117
- valid_mask = ~torch.isinf(attn_mask.squeeze(1).squeeze(1))
118
- else:
119
- raise ValueError(
120
- "attn_mask should be a 4D tensor, but got {}".format(
121
- attn_mask.shape))
122
-
123
- seqlens = valid_mask.sum(dim=1)
124
- return seqlens
125
-
126
- class HunyuanVideoAttnProcessor2_0:
127
- def __init__(self):
128
- if not hasattr(F, "scaled_dot_product_attention"):
129
- raise ImportError(
130
- "HunyuanVideoAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
131
- )
132
-
133
- def __call__(
134
- self,
135
- attn: Attention,
136
- hidden_states: torch.Tensor,
137
- encoder_hidden_states: Optional[torch.Tensor] = None,
138
- attention_mask: Optional[torch.Tensor] = None,
139
- image_rotary_emb: Optional[torch.Tensor] = None,
140
- ) -> torch.Tensor:
141
- if attn.add_q_proj is None and encoder_hidden_states is not None:
142
- hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
143
-
144
- # 1. QKV projections
145
- query = attn.to_q(hidden_states)
146
- key = attn.to_k(hidden_states)
147
- value = attn.to_v(hidden_states)
148
-
149
- query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
150
- key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
151
- value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
152
-
153
- # 2. QK normalization
154
- if attn.norm_q is not None:
155
- query = attn.norm_q(query)
156
- if attn.norm_k is not None:
157
- key = attn.norm_k(key)
158
-
159
- # 3. Rotational positional embeddings applied to latent stream
160
- if image_rotary_emb is not None:
161
- if attn.add_q_proj is None and encoder_hidden_states is not None:
162
- query = torch.cat(
163
- [
164
- apply_rotary_emb(query[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
165
- query[:, :, -encoder_hidden_states.shape[1] :],
166
- ],
167
- dim=2,
168
- )
169
- key = torch.cat(
170
- [
171
- apply_rotary_emb(key[:, :, : -encoder_hidden_states.shape[1]], image_rotary_emb),
172
- key[:, :, -encoder_hidden_states.shape[1] :],
173
- ],
174
- dim=2,
175
- )
176
- else:
177
- query = apply_rotary_emb(query, image_rotary_emb)
178
- key = apply_rotary_emb(key, image_rotary_emb)
179
-
180
- # 4. Encoder condition QKV projection and normalization
181
- if attn.add_q_proj is not None and encoder_hidden_states is not None:
182
- encoder_query = attn.add_q_proj(encoder_hidden_states)
183
- encoder_key = attn.add_k_proj(encoder_hidden_states)
184
- encoder_value = attn.add_v_proj(encoder_hidden_states)
185
-
186
- encoder_query = encoder_query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
187
- encoder_key = encoder_key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
188
- encoder_value = encoder_value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
189
-
190
- if attn.norm_added_q is not None:
191
- encoder_query = attn.norm_added_q(encoder_query)
192
- if attn.norm_added_k is not None:
193
- encoder_key = attn.norm_added_k(encoder_key)
194
-
195
- query = torch.cat([query, encoder_query], dim=2)
196
- key = torch.cat([key, encoder_key], dim=2)
197
- value = torch.cat([value, encoder_value], dim=2)
198
-
199
- # 5. Attention
200
- query = query.transpose(1, 2)
201
- key = key.transpose(1, 2)
202
- value = value.transpose(1, 2)
203
-
204
- if attention_mask is not None:
205
- q_lens = k_lens = extract_seqlens_from_mask(attention_mask)
206
-
207
- hidden_states = torch.zeros_like(query)
208
- for i in range(len(q_lens)):
209
- hidden_states[i][:q_lens[i]] = attention(
210
- query[i][:q_lens[i]].unsqueeze(0),
211
- key[i][:q_lens[i]].unsqueeze(0),
212
- value[i][:q_lens[i]].unsqueeze(0),
213
- attn_mask=None,
214
- )
215
- else:
216
- hidden_states = attention(
217
- query, key, value, attn_mask=attention_mask,
218
- )
219
- hidden_states = hidden_states.flatten(2, 3)
220
- hidden_states = hidden_states.to(query.dtype)
221
-
222
- # 6. Output projection
223
- if encoder_hidden_states is not None:
224
- hidden_states, encoder_hidden_states = (
225
- hidden_states[:, : -encoder_hidden_states.shape[1]],
226
- hidden_states[:, -encoder_hidden_states.shape[1] :],
227
- )
228
-
229
- if getattr(attn, "to_out", None) is not None:
230
- hidden_states = attn.to_out[0](hidden_states)
231
- hidden_states = attn.to_out[1](hidden_states)
232
-
233
- if getattr(attn, "to_add_out", None) is not None:
234
- encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
235
-
236
- return hidden_states, encoder_hidden_states
237
-
238
-
239
- class HunyuanVideoPatchEmbed(nn.Module):
240
- def __init__(
241
- self,
242
- patch_size: Union[int, Tuple[int, int, int]] = 16,
243
- in_chans: int = 3,
244
- embed_dim: int = 768,
245
- ) -> None:
246
- super().__init__()
247
-
248
- patch_size = (patch_size, patch_size, patch_size) if isinstance(patch_size, int) else patch_size
249
- self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
250
-
251
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
252
- hidden_states = self.proj(hidden_states)
253
- hidden_states = hidden_states.flatten(2).transpose(1, 2) # BCFHW -> BNC
254
- return hidden_states
255
-
256
-
257
- class HunyuanVideoAdaNorm(nn.Module):
258
- def __init__(self, in_features: int, out_features: Optional[int] = None) -> None:
259
- super().__init__()
260
-
261
- out_features = out_features or 2 * in_features
262
- self.linear = nn.Linear(in_features, out_features)
263
- self.nonlinearity = nn.SiLU()
264
-
265
- def forward(
266
- self, temb: torch.Tensor
267
- ) -> Tuple[torch.Tensor, torch.Tensor]:
268
- temb = self.linear(self.nonlinearity(temb))
269
- gate_msa, gate_mlp = temb.chunk(2, dim=1)
270
- gate_msa, gate_mlp = gate_msa.unsqueeze(1), gate_mlp.unsqueeze(1)
271
- return gate_msa, gate_mlp
272
-
273
-
274
- class HunyuanVideoTokenReplaceAdaLayerNormZero(nn.Module):
275
- def __init__(self, embedding_dim: int, norm_type: str = "layer_norm", bias: bool = True):
276
- super().__init__()
277
-
278
- self.silu = nn.SiLU()
279
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias)
280
-
281
- if norm_type == "layer_norm":
282
- self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
283
- elif norm_type == "fp32_layer_norm":
284
- self.norm = FP32LayerNorm(embedding_dim, elementwise_affine=False, bias=False)
285
- else:
286
- raise ValueError(
287
- f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'."
288
- )
289
-
290
- def forward(
291
- self,
292
- hidden_states: torch.Tensor,
293
- emb: torch.Tensor,
294
- token_replace_emb: torch.Tensor,
295
- first_frame_num_tokens: int,
296
- ) -> Tuple[torch.Tensor, ...]:
297
- emb = self.linear(self.silu(emb))
298
- token_replace_emb = self.linear(self.silu(token_replace_emb))
299
-
300
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
301
- tr_shift_msa, tr_scale_msa, tr_gate_msa, tr_shift_mlp, tr_scale_mlp, tr_gate_mlp = token_replace_emb.chunk(
302
- 6, dim=1
303
- )
304
-
305
- norm_hidden_states = self.norm(hidden_states)
306
- hidden_states_zero = (
307
- norm_hidden_states[:, :first_frame_num_tokens] * (1 + tr_scale_msa[:, None]) + tr_shift_msa[:, None]
308
- )
309
- hidden_states_orig = (
310
- norm_hidden_states[:, first_frame_num_tokens:] * (1 + scale_msa[:, None]) + shift_msa[:, None]
311
- )
312
- hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
313
-
314
- return (
315
- hidden_states,
316
- gate_msa,
317
- shift_mlp,
318
- scale_mlp,
319
- gate_mlp,
320
- tr_gate_msa,
321
- tr_shift_mlp,
322
- tr_scale_mlp,
323
- tr_gate_mlp,
324
- )
325
-
326
-
327
- class HunyuanVideoTokenReplaceAdaLayerNormZeroSingle(nn.Module):
328
- def __init__(self, embedding_dim: int, norm_type: str = "layer_norm", bias: bool = True):
329
- super().__init__()
330
-
331
- self.silu = nn.SiLU()
332
- self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias)
333
-
334
- if norm_type == "layer_norm":
335
- self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
336
- else:
337
- raise ValueError(
338
- f"Unsupported `norm_type` ({norm_type}) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'."
339
- )
340
-
341
- def forward(
342
- self,
343
- hidden_states: torch.Tensor,
344
- emb: torch.Tensor,
345
- token_replace_emb: torch.Tensor,
346
- first_frame_num_tokens: int,
347
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
348
- emb = self.linear(self.silu(emb))
349
- token_replace_emb = self.linear(self.silu(token_replace_emb))
350
-
351
- shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
352
- tr_shift_msa, tr_scale_msa, tr_gate_msa = token_replace_emb.chunk(3, dim=1)
353
-
354
- norm_hidden_states = self.norm(hidden_states)
355
- hidden_states_zero = (
356
- norm_hidden_states[:, :first_frame_num_tokens] * (1 + tr_scale_msa[:, None]) + tr_shift_msa[:, None]
357
- )
358
- hidden_states_orig = (
359
- norm_hidden_states[:, first_frame_num_tokens:] * (1 + scale_msa[:, None]) + shift_msa[:, None]
360
- )
361
- hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
362
-
363
- return hidden_states, gate_msa, tr_gate_msa
364
-
365
-
366
- class HunyuanVideoConditionEmbedding(nn.Module):
367
- def __init__(
368
- self,
369
- embedding_dim: int,
370
- pooled_projection_dim: int,
371
- guidance_embeds: bool,
372
- image_condition_type: Optional[str] = None,
373
- ):
374
- super().__init__()
375
-
376
- self.image_condition_type = image_condition_type
377
-
378
- self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
379
- self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
380
- self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")
381
-
382
- self.guidance_embedder = None
383
- if guidance_embeds:
384
- self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
385
-
386
- def forward(
387
- self, timestep: torch.Tensor, pooled_projection: torch.Tensor, guidance: Optional[torch.Tensor] = None
388
- ) -> Tuple[torch.Tensor, torch.Tensor]:
389
- timesteps_proj = self.time_proj(timestep)
390
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype)) # (N, D)
391
- pooled_projections = self.text_embedder(pooled_projection)
392
- conditioning = timesteps_emb + pooled_projections
393
-
394
- token_replace_emb = None
395
- if self.image_condition_type == "token_replace":
396
- token_replace_timestep = torch.zeros_like(timestep)
397
- token_replace_proj = self.time_proj(token_replace_timestep)
398
- token_replace_emb = self.timestep_embedder(token_replace_proj.to(dtype=pooled_projection.dtype))
399
- token_replace_emb = token_replace_emb + pooled_projections
400
-
401
- if self.guidance_embedder is not None:
402
- guidance_proj = self.time_proj(guidance)
403
- guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))
404
- conditioning = conditioning + guidance_emb
405
-
406
- return conditioning, token_replace_emb
407
-
408
-
409
- class HunyuanVideoIndividualTokenRefinerBlock(nn.Module):
410
- def __init__(
411
- self,
412
- num_attention_heads: int,
413
- attention_head_dim: int,
414
- mlp_width_ratio: str = 4.0,
415
- mlp_drop_rate: float = 0.0,
416
- attention_bias: bool = True,
417
- ) -> None:
418
- super().__init__()
419
-
420
- hidden_size = num_attention_heads * attention_head_dim
421
-
422
- self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
423
- self.attn = Attention(
424
- query_dim=hidden_size,
425
- cross_attention_dim=None,
426
- heads=num_attention_heads,
427
- dim_head=attention_head_dim,
428
- bias=attention_bias,
429
- )
430
- self.attn.set_processor = None
431
-
432
- self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6)
433
- self.ff = FeedForward(hidden_size, mult=mlp_width_ratio, activation_fn="linear-silu", dropout=mlp_drop_rate)
434
-
435
- self.norm_out = HunyuanVideoAdaNorm(hidden_size, 2 * hidden_size)
436
-
437
- def forward(
438
- self,
439
- hidden_states: torch.Tensor,
440
- temb: torch.Tensor,
441
- attention_mask: Optional[torch.Tensor] = None,
442
- ) -> torch.Tensor:
443
- norm_hidden_states = self.norm1(hidden_states)
444
-
445
- attn_output = self.attn(
446
- hidden_states=norm_hidden_states,
447
- encoder_hidden_states=None,
448
- attention_mask=attention_mask,
449
- )
450
-
451
- gate_msa, gate_mlp = self.norm_out(temb)
452
- hidden_states = hidden_states + attn_output * gate_msa
453
-
454
- ff_output = self.ff(self.norm2(hidden_states))
455
- hidden_states = hidden_states + ff_output * gate_mlp
456
-
457
- return hidden_states
458
-
459
-
460
- class HunyuanVideoIndividualTokenRefiner(nn.Module):
461
- def __init__(
462
- self,
463
- num_attention_heads: int,
464
- attention_head_dim: int,
465
- num_layers: int,
466
- mlp_width_ratio: float = 4.0,
467
- mlp_drop_rate: float = 0.0,
468
- attention_bias: bool = True,
469
- ) -> None:
470
- super().__init__()
471
-
472
- self.refiner_blocks = nn.ModuleList(
473
- [
474
- HunyuanVideoIndividualTokenRefinerBlock(
475
- num_attention_heads=num_attention_heads,
476
- attention_head_dim=attention_head_dim,
477
- mlp_width_ratio=mlp_width_ratio,
478
- mlp_drop_rate=mlp_drop_rate,
479
- attention_bias=attention_bias,
480
- )
481
- for _ in range(num_layers)
482
- ]
483
- )
484
-
485
- def forward(
486
- self,
487
- hidden_states: torch.Tensor,
488
- temb: torch.Tensor,
489
- attention_mask: Optional[torch.Tensor] = None,
490
- ) -> torch.Tensor:
491
- self_attn_mask = None
492
- if attention_mask is not None:
493
- batch_size = attention_mask.shape[0]
494
- seq_len = attention_mask.shape[1]
495
- attention_mask = attention_mask.to(hidden_states.device).bool()
496
- self_attn_mask_1 = attention_mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
497
- self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
498
- self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
499
- self_attn_mask[:, :, :, 0] = True
500
-
501
- for block in self.refiner_blocks:
502
- hidden_states = block(hidden_states, temb, self_attn_mask)
503
-
504
- return hidden_states
505
-
506
-
507
- class HunyuanVideoTokenRefiner(nn.Module):
508
- def __init__(
509
- self,
510
- in_channels: int,
511
- num_attention_heads: int,
512
- attention_head_dim: int,
513
- num_layers: int,
514
- mlp_ratio: float = 4.0,
515
- mlp_drop_rate: float = 0.0,
516
- attention_bias: bool = True,
517
- ) -> None:
518
- super().__init__()
519
-
520
- hidden_size = num_attention_heads * attention_head_dim
521
-
522
- self.time_text_embed = CombinedTimestepTextProjEmbeddings(
523
- embedding_dim=hidden_size, pooled_projection_dim=in_channels
524
- )
525
- self.proj_in = nn.Linear(in_channels, hidden_size, bias=True)
526
- self.token_refiner = HunyuanVideoIndividualTokenRefiner(
527
- num_attention_heads=num_attention_heads,
528
- attention_head_dim=attention_head_dim,
529
- num_layers=num_layers,
530
- mlp_width_ratio=mlp_ratio,
531
- mlp_drop_rate=mlp_drop_rate,
532
- attention_bias=attention_bias,
533
- )
534
-
535
- def forward(
536
- self,
537
- hidden_states: torch.Tensor,
538
- timestep: torch.LongTensor,
539
- attention_mask: Optional[torch.LongTensor] = None,
540
- ) -> torch.Tensor:
541
- if attention_mask is None:
542
- pooled_projections = hidden_states.mean(dim=1)
543
- else:
544
- original_dtype = hidden_states.dtype
545
- mask_float = attention_mask.float().unsqueeze(-1)
546
- pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
547
- pooled_projections = pooled_projections.to(original_dtype)
548
-
549
- temb = self.time_text_embed(timestep, pooled_projections)
550
- hidden_states = self.proj_in(hidden_states)
551
- hidden_states = self.token_refiner(hidden_states, temb, attention_mask)
552
-
553
- return hidden_states
554
-
555
-
556
- class HunyuanVideoRotaryPosEmbed(nn.Module):
557
- def __init__(self, patch_size: int, patch_size_t: int, rope_dim: List[int], theta: float = 256.0) -> None:
558
- super().__init__()
559
-
560
- self.patch_size = patch_size
561
- self.patch_size_t = patch_size_t
562
- self.rope_dim = rope_dim
563
- self.theta = theta
564
-
565
- def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
566
- batch_size, num_channels, num_frames, height, width = hidden_states.shape
567
- rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]
568
-
569
- axes_grids = []
570
- for i in range(3):
571
- # Note: The following line diverges from original behaviour. We create the grid on the device, whereas
572
- # original implementation creates it on CPU and then moves it to device. This results in numerical
573
- # differences in layerwise debugging outputs, but visually it is the same.
574
- grid = torch.arange(0, rope_sizes[i], device=hidden_states.device, dtype=torch.float32)
575
- axes_grids.append(grid)
576
- grid = torch.meshgrid(*axes_grids, indexing="ij") # [T, H, W]
577
- grid = torch.stack(grid, dim=0) # [3, T, H, W]
578
-
579
- freqs = []
580
- for i in range(3):
581
- freq = get_1d_rotary_pos_embed(self.rope_dim[i], grid[i].reshape(-1), self.theta, use_real=True)
582
- freqs.append(freq)
583
-
584
- freqs_cos = torch.cat([f[0] for f in freqs], dim=1) # (W * H * T, D / 2)
585
- freqs_sin = torch.cat([f[1] for f in freqs], dim=1) # (W * H * T, D / 2)
586
- return freqs_cos, freqs_sin
587
-
588
-
589
- class HunyuanVideoSingleTransformerBlock(nn.Module):
590
- def __init__(
591
- self,
592
- num_attention_heads: int,
593
- attention_head_dim: int,
594
- mlp_ratio: float = 4.0,
595
- qk_norm: str = "rms_norm",
596
- ) -> None:
597
- super().__init__()
598
-
599
- hidden_size = num_attention_heads * attention_head_dim
600
- mlp_dim = int(hidden_size * mlp_ratio)
601
-
602
- self.attn = Attention(
603
- query_dim=hidden_size,
604
- cross_attention_dim=None,
605
- dim_head=attention_head_dim,
606
- heads=num_attention_heads,
607
- out_dim=hidden_size,
608
- bias=True,
609
- processor=HunyuanVideoAttnProcessor2_0(),
610
- qk_norm=qk_norm,
611
- eps=1e-6,
612
- pre_only=True,
613
- )
614
-
615
- self.norm = AdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
616
- self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
617
- self.act_mlp = nn.GELU(approximate="tanh")
618
- self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
619
-
620
- def forward(
621
- self,
622
- hidden_states: torch.Tensor,
623
- encoder_hidden_states: torch.Tensor,
624
- temb: torch.Tensor,
625
- attention_mask: Optional[torch.Tensor] = None,
626
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
627
- *args,
628
- **kwargs,
629
- ) -> Tuple[torch.Tensor, torch.Tensor]:
630
- text_seq_length = encoder_hidden_states.shape[1]
631
- hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
632
-
633
- residual = hidden_states
634
-
635
- # 1. Input normalization
636
- norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
637
- mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
638
-
639
- norm_hidden_states, norm_encoder_hidden_states = (
640
- norm_hidden_states[:, :-text_seq_length, :],
641
- norm_hidden_states[:, -text_seq_length:, :],
642
- )
643
-
644
- # 2. Attention
645
- attn_output, context_attn_output = self.attn(
646
- hidden_states=norm_hidden_states,
647
- encoder_hidden_states=norm_encoder_hidden_states,
648
- attention_mask=attention_mask,
649
- image_rotary_emb=image_rotary_emb,
650
- )
651
- attn_output = torch.cat([attn_output, context_attn_output], dim=1)
652
-
653
- # 3. Modulation and residual connection
654
- hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
655
- hidden_states = gate.unsqueeze(1) * self.proj_out(hidden_states)
656
- hidden_states = hidden_states + residual
657
-
658
- hidden_states, encoder_hidden_states = (
659
- hidden_states[:, :-text_seq_length, :],
660
- hidden_states[:, -text_seq_length:, :],
661
- )
662
- return hidden_states, encoder_hidden_states
663
-
664
-
665
- class HunyuanVideoTransformerBlock(nn.Module):
666
- def __init__(
667
- self,
668
- num_attention_heads: int,
669
- attention_head_dim: int,
670
- mlp_ratio: float,
671
- qk_norm: str = "rms_norm",
672
- ) -> None:
673
- super().__init__()
674
-
675
- hidden_size = num_attention_heads * attention_head_dim
676
-
677
- self.norm1 = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
678
- self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
679
-
680
- self.attn = Attention(
681
- query_dim=hidden_size,
682
- cross_attention_dim=None,
683
- added_kv_proj_dim=hidden_size,
684
- dim_head=attention_head_dim,
685
- heads=num_attention_heads,
686
- out_dim=hidden_size,
687
- context_pre_only=False,
688
- bias=True,
689
- processor=HunyuanVideoAttnProcessor2_0(),
690
- qk_norm=qk_norm,
691
- eps=1e-6,
692
- )
693
-
694
- self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
695
- self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
696
-
697
- self.norm2_context = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
698
- self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
699
-
700
- def forward(
701
- self,
702
- hidden_states: torch.Tensor,
703
- encoder_hidden_states: torch.Tensor,
704
- temb: torch.Tensor,
705
- attention_mask: Optional[torch.Tensor] = None,
706
- freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
707
- *args,
708
- **kwargs,
709
- ) -> Tuple[torch.Tensor, torch.Tensor]:
710
- # 1. Input normalization
711
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
712
- norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
713
- encoder_hidden_states, emb=temb
714
- )
715
-
716
- # 2. Joint attention
717
- attn_output, context_attn_output = self.attn(
718
- hidden_states=norm_hidden_states,
719
- encoder_hidden_states=norm_encoder_hidden_states,
720
- attention_mask=attention_mask,
721
- image_rotary_emb=freqs_cis,
722
- )
723
-
724
- # 3. Modulation and residual connection
725
- hidden_states = hidden_states + attn_output * gate_msa.unsqueeze(1)
726
- encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa.unsqueeze(1)
727
-
728
- norm_hidden_states = self.norm2(hidden_states)
729
- norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
730
-
731
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
732
- norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
733
-
734
- # 4. Feed-forward
735
- ff_output = self.ff(norm_hidden_states)
736
- context_ff_output = self.ff_context(norm_encoder_hidden_states)
737
-
738
- hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output
739
- encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
740
-
741
- return hidden_states, encoder_hidden_states
742
-
743
-
744
- class HunyuanVideoTokenReplaceSingleTransformerBlock(nn.Module):
745
- def __init__(
746
- self,
747
- num_attention_heads: int,
748
- attention_head_dim: int,
749
- mlp_ratio: float = 4.0,
750
- qk_norm: str = "rms_norm",
751
- ) -> None:
752
- super().__init__()
753
-
754
- hidden_size = num_attention_heads * attention_head_dim
755
- mlp_dim = int(hidden_size * mlp_ratio)
756
-
757
- self.attn = Attention(
758
- query_dim=hidden_size,
759
- cross_attention_dim=None,
760
- dim_head=attention_head_dim,
761
- heads=num_attention_heads,
762
- out_dim=hidden_size,
763
- bias=True,
764
- processor=HunyuanVideoAttnProcessor2_0(),
765
- qk_norm=qk_norm,
766
- eps=1e-6,
767
- pre_only=True,
768
- )
769
-
770
- self.norm = HunyuanVideoTokenReplaceAdaLayerNormZeroSingle(hidden_size, norm_type="layer_norm")
771
- self.proj_mlp = nn.Linear(hidden_size, mlp_dim)
772
- self.act_mlp = nn.GELU(approximate="tanh")
773
- self.proj_out = nn.Linear(hidden_size + mlp_dim, hidden_size)
774
-
775
- def forward(
776
- self,
777
- hidden_states: torch.Tensor,
778
- encoder_hidden_states: torch.Tensor,
779
- temb: torch.Tensor,
780
- attention_mask: Optional[torch.Tensor] = None,
781
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
782
- token_replace_emb: torch.Tensor = None,
783
- num_tokens: int = None,
784
- ) -> Tuple[torch.Tensor, torch.Tensor]:
785
- text_seq_length = encoder_hidden_states.shape[1]
786
- hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
787
-
788
- residual = hidden_states
789
-
790
- # 1. Input normalization
791
- norm_hidden_states, gate, tr_gate = self.norm(hidden_states, temb, token_replace_emb, num_tokens)
792
- mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
793
-
794
- norm_hidden_states, norm_encoder_hidden_states = (
795
- norm_hidden_states[:, :-text_seq_length, :],
796
- norm_hidden_states[:, -text_seq_length:, :],
797
- )
798
-
799
- # 2. Attention
800
- attn_output, context_attn_output = self.attn(
801
- hidden_states=norm_hidden_states,
802
- encoder_hidden_states=norm_encoder_hidden_states,
803
- attention_mask=attention_mask,
804
- image_rotary_emb=image_rotary_emb,
805
- )
806
- attn_output = torch.cat([attn_output, context_attn_output], dim=1)
807
-
808
- # 3. Modulation and residual connection
809
- hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
810
-
811
- proj_output = self.proj_out(hidden_states)
812
- hidden_states_zero = proj_output[:, :num_tokens] * tr_gate.unsqueeze(1)
813
- hidden_states_orig = proj_output[:, num_tokens:] * gate.unsqueeze(1)
814
- hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
815
- hidden_states = hidden_states + residual
816
-
817
- hidden_states, encoder_hidden_states = (
818
- hidden_states[:, :-text_seq_length, :],
819
- hidden_states[:, -text_seq_length:, :],
820
- )
821
- return hidden_states, encoder_hidden_states
822
-
823
-
824
- class HunyuanVideoTokenReplaceTransformerBlock(nn.Module):
825
- def __init__(
826
- self,
827
- num_attention_heads: int,
828
- attention_head_dim: int,
829
- mlp_ratio: float,
830
- qk_norm: str = "rms_norm",
831
- ) -> None:
832
- super().__init__()
833
-
834
- hidden_size = num_attention_heads * attention_head_dim
835
-
836
- self.norm1 = HunyuanVideoTokenReplaceAdaLayerNormZero(hidden_size, norm_type="layer_norm")
837
- self.norm1_context = AdaLayerNormZero(hidden_size, norm_type="layer_norm")
838
-
839
- self.attn = Attention(
840
- query_dim=hidden_size,
841
- cross_attention_dim=None,
842
- added_kv_proj_dim=hidden_size,
843
- dim_head=attention_head_dim,
844
- heads=num_attention_heads,
845
- out_dim=hidden_size,
846
- context_pre_only=False,
847
- bias=True,
848
- processor=HunyuanVideoAttnProcessor2_0(),
849
- qk_norm=qk_norm,
850
- eps=1e-6,
851
- )
852
-
853
- self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
854
- self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
855
-
856
- self.norm2_context = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
857
- self.ff_context = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu-approximate")
858
-
859
- def forward(
860
- self,
861
- hidden_states: torch.Tensor,
862
- encoder_hidden_states: torch.Tensor,
863
- temb: torch.Tensor,
864
- attention_mask: Optional[torch.Tensor] = None,
865
- freqs_cis: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
866
- token_replace_emb: torch.Tensor = None,
867
- num_tokens: int = None,
868
- ) -> Tuple[torch.Tensor, torch.Tensor]:
869
- # 1. Input normalization
870
- (
871
- norm_hidden_states,
872
- gate_msa,
873
- shift_mlp,
874
- scale_mlp,
875
- gate_mlp,
876
- tr_gate_msa,
877
- tr_shift_mlp,
878
- tr_scale_mlp,
879
- tr_gate_mlp,
880
- ) = self.norm1(hidden_states, temb, token_replace_emb, num_tokens)
881
- norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
882
- encoder_hidden_states, emb=temb
883
- )
884
-
885
- # 2. Joint attention
886
- attn_output, context_attn_output = self.attn(
887
- hidden_states=norm_hidden_states,
888
- encoder_hidden_states=norm_encoder_hidden_states,
889
- attention_mask=attention_mask,
890
- image_rotary_emb=freqs_cis,
891
- )
892
-
893
- # 3. Modulation and residual connection
894
- hidden_states_zero = hidden_states[:, :num_tokens] + attn_output[:, :num_tokens] * tr_gate_msa.unsqueeze(1)
895
- hidden_states_orig = hidden_states[:, num_tokens:] + attn_output[:, num_tokens:] * gate_msa.unsqueeze(1)
896
- hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
897
- encoder_hidden_states = encoder_hidden_states + context_attn_output * c_gate_msa.unsqueeze(1)
898
-
899
- norm_hidden_states = self.norm2(hidden_states)
900
- norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
901
-
902
- hidden_states_zero = norm_hidden_states[:, :num_tokens] * (1 + tr_scale_mlp[:, None]) + tr_shift_mlp[:, None]
903
- hidden_states_orig = norm_hidden_states[:, num_tokens:] * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
904
- norm_hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
905
- norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
906
-
907
- # 4. Feed-forward
908
- ff_output = self.ff(norm_hidden_states)
909
- context_ff_output = self.ff_context(norm_encoder_hidden_states)
910
-
911
- hidden_states_zero = hidden_states[:, :num_tokens] + ff_output[:, :num_tokens] * tr_gate_mlp.unsqueeze(1)
912
- hidden_states_orig = hidden_states[:, num_tokens:] + ff_output[:, num_tokens:] * gate_mlp.unsqueeze(1)
913
- hidden_states = torch.cat([hidden_states_zero, hidden_states_orig], dim=1)
914
- encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
915
-
916
- return hidden_states, encoder_hidden_states
917
-
918
-
919
- class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
920
- r"""
921
- A Transformer model for video-like data used in [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo).
922
-
923
- Args:
924
- in_channels (`int`, defaults to `16`):
925
- The number of channels in the input.
926
- out_channels (`int`, defaults to `16`):
927
- The number of channels in the output.
928
- num_attention_heads (`int`, defaults to `24`):
929
- The number of heads to use for multi-head attention.
930
- attention_head_dim (`int`, defaults to `128`):
931
- The number of channels in each head.
932
- num_layers (`int`, defaults to `20`):
933
- The number of layers of dual-stream blocks to use.
934
- num_single_layers (`int`, defaults to `40`):
935
- The number of layers of single-stream blocks to use.
936
- num_refiner_layers (`int`, defaults to `2`):
937
- The number of layers of refiner blocks to use.
938
- mlp_ratio (`float`, defaults to `4.0`):
939
- The ratio of the hidden layer size to the input size in the feedforward network.
940
- patch_size (`int`, defaults to `2`):
941
- The size of the spatial patches to use in the patch embedding layer.
942
- patch_size_t (`int`, defaults to `1`):
943
- The size of the temporal patches to use in the patch embedding layer.
944
- qk_norm (`str`, defaults to `rms_norm`):
945
- The normalization to use for the query and key projections in the attention layers.
946
- guidance_embeds (`bool`, defaults to `True`):
947
- Whether to use guidance embeddings in the model.
948
- text_embed_dim (`int`, defaults to `4096`):
949
- Input dimension of text embeddings from the text encoder.
950
- pooled_projection_dim (`int`, defaults to `768`):
951
- The dimension of the pooled projection of the text embeddings.
952
- rope_theta (`float`, defaults to `256.0`):
953
- The value of theta to use in the RoPE layer.
954
- rope_axes_dim (`Tuple[int]`, defaults to `(16, 56, 56)`):
955
- The dimensions of the axes to use in the RoPE layer.
956
- image_condition_type (`str`, *optional*, defaults to `None`):
957
- The type of image conditioning to use. If `None`, no image conditioning is used. If `latent_concat`, the
958
- image is concatenated to the latent stream. If `token_replace`, the image is used to replace first-frame
959
- tokens in the latent stream and apply conditioning.
960
- """
961
-
962
- _supports_gradient_checkpointing = True
963
- _skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
964
- _no_split_modules = [
965
- "HunyuanVideoTransformerBlock",
966
- "HunyuanVideoSingleTransformerBlock",
967
- "HunyuanVideoPatchEmbed",
968
- "HunyuanVideoTokenRefiner",
969
- ]
970
- _repeated_blocks = [
971
- "HunyuanVideoTransformerBlock",
972
- "HunyuanVideoSingleTransformerBlock",
973
- "HunyuanVideoPatchEmbed",
974
- "HunyuanVideoTokenRefiner",
975
- ]
976
-
977
- @register_to_config
978
- def __init__(
979
- self,
980
- in_channels: int = 16,
981
- out_channels: int = 16,
982
- num_attention_heads: int = 24,
983
- attention_head_dim: int = 128,
984
- num_layers: int = 20,
985
- num_single_layers: int = 40,
986
- num_refiner_layers: int = 2,
987
- mlp_ratio: float = 4.0,
988
- patch_size: int = 2,
989
- patch_size_t: int = 1,
990
- qk_norm: str = "rms_norm",
991
- guidance_embeds: bool = True,
992
- text_embed_dim: int = 4096,
993
- pooled_projection_dim: int = 768,
994
- rope_theta: float = 256.0,
995
- rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
996
- image_condition_type: Optional[str] = None,
997
- ) -> None:
998
- super().__init__()
999
-
1000
- supported_image_condition_types = ["latent_concat", "token_replace"]
1001
- if image_condition_type is not None and image_condition_type not in supported_image_condition_types:
1002
- raise ValueError(
1003
- f"Invalid `image_condition_type` ({image_condition_type}). Supported ones are: {supported_image_condition_types}"
1004
- )
1005
-
1006
- inner_dim = num_attention_heads * attention_head_dim
1007
- out_channels = out_channels or in_channels
1008
-
1009
- # 1. Latent and condition embedders
1010
- self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
1011
- self.context_embedder = HunyuanVideoTokenRefiner(
1012
- text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
1013
- )
1014
-
1015
- self.time_text_embed = HunyuanVideoConditionEmbedding(
1016
- inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
1017
- )
1018
-
1019
- # 2. RoPE
1020
- self.rope = HunyuanVideoRotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)
1021
-
1022
- # 3. Dual stream transformer blocks
1023
- if image_condition_type == "token_replace":
1024
- self.transformer_blocks = nn.ModuleList(
1025
- [
1026
- HunyuanVideoTokenReplaceTransformerBlock(
1027
- num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
1028
- )
1029
- for _ in range(num_layers)
1030
- ]
1031
- )
1032
- else:
1033
- self.transformer_blocks = nn.ModuleList(
1034
- [
1035
- HunyuanVideoTransformerBlock(
1036
- num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
1037
- )
1038
- for _ in range(num_layers)
1039
- ]
1040
- )
1041
-
1042
- # 4. Single stream transformer blocks
1043
- if image_condition_type == "token_replace":
1044
- self.single_transformer_blocks = nn.ModuleList(
1045
- [
1046
- HunyuanVideoTokenReplaceSingleTransformerBlock(
1047
- num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
1048
- )
1049
- for _ in range(num_single_layers)
1050
- ]
1051
- )
1052
- else:
1053
- self.single_transformer_blocks = nn.ModuleList(
1054
- [
1055
- HunyuanVideoSingleTransformerBlock(
1056
- num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
1057
- )
1058
- for _ in range(num_single_layers)
1059
- ]
1060
- )
1061
-
1062
- # 5. Output projection
1063
- self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
1064
- self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
1065
-
1066
-
1067
- self.gradient_checkpointing = False
1068
- self.sp_world_size = 1
1069
- self.sp_world_rank = 0
1070
-
1071
- def _set_gradient_checkpointing(self, *args, **kwargs):
1072
- if "value" in kwargs:
1073
- self.gradient_checkpointing = kwargs["value"]
1074
- elif "enable" in kwargs:
1075
- self.gradient_checkpointing = kwargs["enable"]
1076
- else:
1077
- raise ValueError("Invalid set gradient checkpointing")
1078
-
1079
- def enable_multi_gpus_inference(self,):
1080
- self.sp_world_size = get_sequence_parallel_world_size()
1081
- self.sp_world_rank = get_sequence_parallel_rank()
1082
- self.set_attn_processor(HunyuanVideoMultiGPUsAttnProcessor2_0())
1083
-
1084
- @property
1085
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
1086
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
1087
- r"""
1088
- Returns:
1089
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
1090
- indexed by their weight names.
1091
- """
1092
- # set recursively
1093
- processors = {}
1094
-
1095
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
1096
- if hasattr(module, "get_processor"):
1097
- processors[f"{name}.processor"] = module.get_processor()
1098
-
1099
- for sub_name, child in module.named_children():
1100
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
1101
-
1102
- return processors
1103
-
1104
- for name, module in self.named_children():
1105
- fn_recursive_add_processors(name, module, processors)
1106
-
1107
- return processors
1108
-
1109
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
1110
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
1111
- r"""
1112
- Sets the attention processor to use to compute attention.
1113
-
1114
- Parameters:
1115
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
1116
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
1117
- for **all** `Attention` layers.
1118
-
1119
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
1120
- processor. This is strongly recommended when setting trainable attention processors.
1121
-
1122
- """
1123
- count = len(self.attn_processors.keys())
1124
-
1125
- if isinstance(processor, dict) and len(processor) != count:
1126
- raise ValueError(
1127
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
1128
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
1129
- )
1130
-
1131
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
1132
- if hasattr(module, "set_processor") and module.set_processor is not None:
1133
- if not isinstance(processor, dict):
1134
- module.set_processor(processor)
1135
- else:
1136
- module.set_processor(processor.pop(f"{name}.processor"))
1137
-
1138
- for sub_name, child in module.named_children():
1139
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
1140
-
1141
- for name, module in self.named_children():
1142
- fn_recursive_attn_processor(name, module, processor)
1143
-
1144
- def forward(
1145
- self,
1146
- hidden_states: torch.Tensor,
1147
- timestep: torch.LongTensor,
1148
- encoder_hidden_states: torch.Tensor,
1149
- encoder_attention_mask: torch.Tensor,
1150
- pooled_projections: torch.Tensor,
1151
- guidance: torch.Tensor = None,
1152
- attention_kwargs: Optional[Dict[str, Any]] = None,
1153
- return_dict: bool = True,
1154
- ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
1155
- if attention_kwargs is not None:
1156
- attention_kwargs = attention_kwargs.copy()
1157
- lora_scale = attention_kwargs.pop("scale", 1.0)
1158
- else:
1159
- lora_scale = 1.0
1160
-
1161
- if USE_PEFT_BACKEND:
1162
- # weight the lora layers by setting `lora_scale` for each PEFT layer
1163
- scale_lora_layers(self, lora_scale)
1164
- else:
1165
- if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
1166
- logger.warning(
1167
- "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
1168
- )
1169
-
1170
- batch_size, num_channels, num_frames, height, width = hidden_states.shape
1171
- p, p_t = self.config.patch_size, self.config.patch_size_t
1172
- post_patch_num_frames = num_frames // p_t
1173
- post_patch_height = height // p
1174
- post_patch_width = width // p
1175
- first_frame_num_tokens = 1 * post_patch_height * post_patch_width
1176
-
1177
- # 1. RoPE
1178
- image_rotary_emb = self.rope(hidden_states)
1179
-
1180
- # 2. Conditional embeddings
1181
- temb, token_replace_emb = self.time_text_embed(timestep, pooled_projections, guidance)
1182
-
1183
- hidden_states = self.x_embedder(hidden_states)
1184
- encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)
1185
-
1186
- # 3. Attention mask preparation
1187
- latent_sequence_length = hidden_states.shape[1]
1188
- condition_sequence_length = encoder_hidden_states.shape[1]
1189
- sequence_length = latent_sequence_length + condition_sequence_length
1190
- attention_mask = torch.ones(
1191
- batch_size, sequence_length, device=hidden_states.device, dtype=torch.bool
1192
- ) # [B, N]
1193
- effective_condition_sequence_length = encoder_attention_mask.sum(dim=1, dtype=torch.int) # [B,]
1194
- effective_sequence_length = latent_sequence_length + effective_condition_sequence_length
1195
- indices = torch.arange(sequence_length, device=hidden_states.device).unsqueeze(0) # [1, N]
1196
- mask_indices = indices >= effective_sequence_length.unsqueeze(1) # [B, N]
1197
- attention_mask = attention_mask.masked_fill(mask_indices, False)
1198
- attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) # [B, 1, 1, N]
1199
-
1200
- # Context Parallel
1201
- if self.sp_world_size > 1:
1202
- hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
1203
- if image_rotary_emb is not None:
1204
- image_rotary_emb = (
1205
- torch.chunk(image_rotary_emb[0], self.sp_world_size, dim=0)[self.sp_world_rank],
1206
- torch.chunk(image_rotary_emb[1], self.sp_world_size, dim=0)[self.sp_world_rank]
1207
- )
1208
- if self.sp_world_rank >=1:
1209
- first_frame_num_tokens = 0
1210
-
1211
- # 4. Transformer blocks
1212
- if torch.is_grad_enabled() and self.gradient_checkpointing:
1213
- for block in self.transformer_blocks:
1214
-
1215
- def create_custom_forward(module):
1216
- def custom_forward(*inputs):
1217
- return module(*inputs)
1218
-
1219
- return custom_forward
1220
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1221
- hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
1222
- create_custom_forward(block),
1223
- hidden_states,
1224
- encoder_hidden_states,
1225
- temb,
1226
- attention_mask,
1227
- image_rotary_emb,
1228
- token_replace_emb,
1229
- first_frame_num_tokens,
1230
- **ckpt_kwargs,
1231
- )
1232
-
1233
- for block in self.single_transformer_blocks:
1234
-
1235
- def create_custom_forward(module):
1236
- def custom_forward(*inputs):
1237
- return module(*inputs)
1238
-
1239
- return custom_forward
1240
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1241
- hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
1242
- create_custom_forward(block),
1243
- hidden_states,
1244
- encoder_hidden_states,
1245
- temb,
1246
- attention_mask,
1247
- image_rotary_emb,
1248
- token_replace_emb,
1249
- first_frame_num_tokens,
1250
- **ckpt_kwargs,
1251
- )
1252
-
1253
- else:
1254
- for block in self.transformer_blocks:
1255
- hidden_states, encoder_hidden_states = block(
1256
- hidden_states,
1257
- encoder_hidden_states,
1258
- temb,
1259
- attention_mask,
1260
- image_rotary_emb,
1261
- token_replace_emb,
1262
- first_frame_num_tokens,
1263
- )
1264
-
1265
- for block in self.single_transformer_blocks:
1266
- hidden_states, encoder_hidden_states = block(
1267
- hidden_states,
1268
- encoder_hidden_states,
1269
- temb,
1270
- attention_mask,
1271
- image_rotary_emb,
1272
- token_replace_emb,
1273
- first_frame_num_tokens,
1274
- )
1275
-
1276
- # 5. Output projection
1277
- hidden_states = self.norm_out(hidden_states, temb)
1278
- hidden_states = self.proj_out(hidden_states)
1279
-
1280
- if self.sp_world_size > 1:
1281
- hidden_states = get_sp_group().all_gather(hidden_states, dim=1)
1282
-
1283
- hidden_states = hidden_states.reshape(
1284
- batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
1285
- )
1286
- hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
1287
- hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
1288
-
1289
- if USE_PEFT_BACKEND:
1290
- # remove `lora_scale` from each PEFT layer
1291
- unscale_lora_layers(self, lora_scale)
1292
-
1293
- if not return_dict:
1294
- return (hidden_states,)
1295
-
1296
- return Transformer2DModelOutput(sample=hidden_states)
1297
-
1298
-
1299
- @classmethod
1300
- def from_pretrained(
1301
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
1302
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
1303
- ):
1304
- if subfolder is not None:
1305
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
1306
- print(f"loaded 3D transformer's pretrained weights from {pretrained_model_path} ...")
1307
-
1308
- config_file = os.path.join(pretrained_model_path, 'config.json')
1309
- if not os.path.isfile(config_file):
1310
- raise RuntimeError(f"{config_file} does not exist")
1311
- with open(config_file, "r") as f:
1312
- config = json.load(f)
1313
-
1314
- from diffusers.utils import WEIGHTS_NAME
1315
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
1316
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
1317
-
1318
- if "dict_mapping" in transformer_additional_kwargs.keys():
1319
- for key in transformer_additional_kwargs["dict_mapping"]:
1320
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
1321
-
1322
- if low_cpu_mem_usage:
1323
- try:
1324
- import re
1325
-
1326
- from diffusers import __version__ as diffusers_version
1327
- if diffusers_version >= "0.33.0":
1328
- from diffusers.models.model_loading_utils import \
1329
- load_model_dict_into_meta
1330
- else:
1331
- from diffusers.models.modeling_utils import \
1332
- load_model_dict_into_meta
1333
- from diffusers.utils import is_accelerate_available
1334
- if is_accelerate_available():
1335
- import accelerate
1336
-
1337
- # Instantiate model with empty weights
1338
- with accelerate.init_empty_weights():
1339
- model = cls.from_config(config, **transformer_additional_kwargs)
1340
-
1341
- param_device = "cpu"
1342
- if os.path.exists(model_file):
1343
- state_dict = torch.load(model_file, map_location="cpu")
1344
- elif os.path.exists(model_file_safetensors):
1345
- from safetensors.torch import load_file, safe_open
1346
- state_dict = load_file(model_file_safetensors)
1347
- else:
1348
- from safetensors.torch import load_file, safe_open
1349
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1350
- state_dict = {}
1351
- print(model_files_safetensors)
1352
- for _model_file_safetensors in model_files_safetensors:
1353
- _state_dict = load_file(_model_file_safetensors)
1354
- for key in _state_dict:
1355
- state_dict[key] = _state_dict[key]
1356
-
1357
- filtered_state_dict = {}
1358
- for key in state_dict:
1359
- if key in model.state_dict() and model.state_dict()[key].size() == state_dict[key].size():
1360
- filtered_state_dict[key] = state_dict[key]
1361
- else:
1362
- print(f"Skipping key '{key}' due to size mismatch or absence in model.")
1363
-
1364
- model_keys = set(model.state_dict().keys())
1365
- loaded_keys = set(filtered_state_dict.keys())
1366
- missing_keys = model_keys - loaded_keys
1367
-
1368
- def initialize_missing_parameters(missing_keys, model_state_dict, torch_dtype=None):
1369
- initialized_dict = {}
1370
-
1371
- with torch.no_grad():
1372
- for key in missing_keys:
1373
- param_shape = model_state_dict[key].shape
1374
- param_dtype = torch_dtype if torch_dtype is not None else model_state_dict[key].dtype
1375
- if 'weight' in key:
1376
- if any(norm_type in key for norm_type in ['norm', 'ln_', 'layer_norm', 'group_norm', 'batch_norm']):
1377
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1378
- elif 'embedding' in key or 'embed' in key:
1379
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1380
- elif 'head' in key or 'output' in key or 'proj_out' in key:
1381
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1382
- elif len(param_shape) >= 2:
1383
- initialized_dict[key] = torch.empty(param_shape, dtype=param_dtype)
1384
- nn.init.xavier_uniform_(initialized_dict[key])
1385
- else:
1386
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1387
- elif 'bias' in key:
1388
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1389
- elif 'running_mean' in key:
1390
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1391
- elif 'running_var' in key:
1392
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1393
- elif 'num_batches_tracked' in key:
1394
- initialized_dict[key] = torch.zeros(param_shape, dtype=torch.long)
1395
- else:
1396
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1397
-
1398
- return initialized_dict
1399
-
1400
- if missing_keys:
1401
- print(f"Missing keys will be initialized: {sorted(missing_keys)}")
1402
- initialized_params = initialize_missing_parameters(
1403
- missing_keys,
1404
- model.state_dict(),
1405
- torch_dtype
1406
- )
1407
- filtered_state_dict.update(initialized_params)
1408
-
1409
- if diffusers_version >= "0.33.0":
1410
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
1411
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
1412
- load_model_dict_into_meta(
1413
- model,
1414
- filtered_state_dict,
1415
- dtype=torch_dtype,
1416
- model_name_or_path=pretrained_model_path,
1417
- )
1418
- else:
1419
- model._convert_deprecated_attention_blocks(filtered_state_dict)
1420
- unexpected_keys = load_model_dict_into_meta(
1421
- model,
1422
- filtered_state_dict,
1423
- device=param_device,
1424
- dtype=torch_dtype,
1425
- model_name_or_path=pretrained_model_path,
1426
- )
1427
-
1428
- if cls._keys_to_ignore_on_load_unexpected is not None:
1429
- for pat in cls._keys_to_ignore_on_load_unexpected:
1430
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
1431
-
1432
- if len(unexpected_keys) > 0:
1433
- print(
1434
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {', '.join(unexpected_keys)}"
1435
- )
1436
-
1437
- return model
1438
- except Exception as e:
1439
- print(
1440
- f"The low_cpu_mem_usage mode did not work because {e}. Falling back to low_cpu_mem_usage=False."
1441
- )
1442
-
1443
- model = cls.from_config(config, **transformer_additional_kwargs)
1444
- if os.path.exists(model_file):
1445
- state_dict = torch.load(model_file, map_location="cpu")
1446
- elif os.path.exists(model_file_safetensors):
1447
- from safetensors.torch import load_file, safe_open
1448
- state_dict = load_file(model_file_safetensors)
1449
- else:
1450
- from safetensors.torch import load_file, safe_open
1451
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1452
- state_dict = {}
1453
- for _model_file_safetensors in model_files_safetensors:
1454
- _state_dict = load_file(_model_file_safetensors)
1455
- for key in _state_dict:
1456
- state_dict[key] = _state_dict[key]
1457
-
1458
- tmp_state_dict = {}
1459
- for key in state_dict:
1460
- if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
1461
- tmp_state_dict[key] = state_dict[key]
1462
- else:
1463
- print(key, "Size doesn't match, skip")
1464
-
1465
- state_dict = tmp_state_dict
1466
-
1467
- m, u = model.load_state_dict(state_dict, strict=False)
1468
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1469
- print(m)
1470
-
1471
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
1472
- print(f"### All Parameters: {sum(params) / 1e6} M")
1473
-
1474
- params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
1475
- print(f"### attn1 Parameters: {sum(params) / 1e6} M")
1476
-
1477
- model = model.to(torch_dtype)
1478
- return model
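
The `from_pretrained` classmethod removed above loads either a single `diffusion_pytorch_model` weights file or a directory of sharded `*.safetensors`, drops keys whose shapes do not match the instantiated config, heuristically initializes any keys that are still missing, and can build the model on the meta device first when `low_cpu_mem_usage=True`. A minimal usage sketch follows; the class name, import path, checkpoint directory, and subfolder are placeholders assumed for illustration, not values taken from this repository.

# Hypothetical sketch of calling the removed from_pretrained classmethod.
# The class name, import path, and checkpoint directory are assumed placeholders.
import torch
# from videox_fun.models import HunyuanVideoTransformer3DModel  # import path assumed

transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    "models/HunyuanVideo",              # placeholder directory containing config.json and weight shards
    subfolder="transformer",            # joined onto the path before the config is read
    transformer_additional_kwargs={},   # overrides merged into the loaded config (supports "dict_mapping")
    low_cpu_mem_usage=True,             # instantiate on the meta device, then load weights into it
    torch_dtype=torch.bfloat16,         # dtype the loaded weights are cast to
)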
 
videox_fun/models/hunyuanvideo_vae.py DELETED
@@ -1,1082 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
2
- # Copyright 2025 The Hunyuan Team and The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- from typing import Optional, Tuple, Union
17
-
18
- import numpy as np
19
- import torch
20
- import torch.nn as nn
21
- import torch.nn.functional as F
22
- from diffusers.configuration_utils import ConfigMixin, register_to_config
23
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
24
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
25
- from diffusers.models.activations import get_activation
26
- from diffusers.models.attention import FeedForward
27
- from diffusers.models.attention_processor import Attention
28
- from diffusers.models.autoencoders.vae import (DecoderOutput,
29
- DiagonalGaussianDistribution)
30
- from diffusers.models.embeddings import TimestepEmbedding, Timesteps
31
- from diffusers.models.modeling_outputs import (AutoencoderKLOutput,
32
- Transformer2DModelOutput)
33
- from diffusers.models.modeling_utils import ModelMixin
34
- from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
35
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
36
- scale_lora_layers, unscale_lora_layers)
37
- from diffusers.utils.accelerate_utils import apply_forward_hook
38
- from diffusers.utils.torch_utils import maybe_allow_in_graph
39
-
40
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
-
42
-
43
- def prepare_causal_attention_mask(
44
- num_frames: int, height_width: int, dtype: torch.dtype, device: torch.device, batch_size: int = None
45
- ) -> torch.Tensor:
46
- indices = torch.arange(1, num_frames + 1, dtype=torch.int32, device=device)
47
- indices_blocks = indices.repeat_interleave(height_width)
48
- x, y = torch.meshgrid(indices_blocks, indices_blocks, indexing="xy")
49
- mask = torch.where(x <= y, 0, -float("inf")).to(dtype=dtype)
50
-
51
- if batch_size is not None:
52
- mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
53
- return mask
54
-
55
-
56
- class HunyuanVideoCausalConv3d(nn.Module):
57
- def __init__(
58
- self,
59
- in_channels: int,
60
- out_channels: int,
61
- kernel_size: Union[int, Tuple[int, int, int]] = 3,
62
- stride: Union[int, Tuple[int, int, int]] = 1,
63
- padding: Union[int, Tuple[int, int, int]] = 0,
64
- dilation: Union[int, Tuple[int, int, int]] = 1,
65
- bias: bool = True,
66
- pad_mode: str = "replicate",
67
- ) -> None:
68
- super().__init__()
69
-
70
- kernel_size = (kernel_size, kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
71
-
72
- self.pad_mode = pad_mode
73
- self.time_causal_padding = (
74
- kernel_size[0] // 2,
75
- kernel_size[0] // 2,
76
- kernel_size[1] // 2,
77
- kernel_size[1] // 2,
78
- kernel_size[2] - 1,
79
- 0,
80
- )
81
-
82
- self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
83
-
84
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
85
- hidden_states = F.pad(hidden_states, self.time_causal_padding, mode=self.pad_mode)
86
- return self.conv(hidden_states)
87
-
88
-
89
- class HunyuanVideoUpsampleCausal3D(nn.Module):
90
- def __init__(
91
- self,
92
- in_channels: int,
93
- out_channels: Optional[int] = None,
94
- kernel_size: int = 3,
95
- stride: int = 1,
96
- bias: bool = True,
97
- upsample_factor: Tuple[float, float, float] = (2, 2, 2),
98
- ) -> None:
99
- super().__init__()
100
-
101
- out_channels = out_channels or in_channels
102
- self.upsample_factor = upsample_factor
103
-
104
- self.conv = HunyuanVideoCausalConv3d(in_channels, out_channels, kernel_size, stride, bias=bias)
105
-
106
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
107
- num_frames = hidden_states.size(2)
108
-
109
- first_frame, other_frames = hidden_states.split((1, num_frames - 1), dim=2)
110
- first_frame = F.interpolate(
111
- first_frame.squeeze(2), scale_factor=self.upsample_factor[1:], mode="nearest"
112
- ).unsqueeze(2)
113
-
114
- if num_frames > 1:
115
- # See: https://github.com/pytorch/pytorch/issues/81665
116
- # Unless you have a version of pytorch where non-contiguous implementation of F.interpolate
117
- # is fixed, this will raise either a runtime error, or fail silently with bad outputs.
118
- # If you are encountering an error here, make sure to try running encoding/decoding with
119
- # `vae.enable_tiling()` first. If that doesn't work, open an issue at:
120
- # https://github.com/huggingface/diffusers/issues
121
- other_frames = other_frames.contiguous()
122
- other_frames = F.interpolate(other_frames, scale_factor=self.upsample_factor, mode="nearest")
123
- hidden_states = torch.cat((first_frame, other_frames), dim=2)
124
- else:
125
- hidden_states = first_frame
126
-
127
- hidden_states = self.conv(hidden_states)
128
- return hidden_states
129
-
130
-
131
- class HunyuanVideoDownsampleCausal3D(nn.Module):
132
- def __init__(
133
- self,
134
- channels: int,
135
- out_channels: Optional[int] = None,
136
- padding: int = 1,
137
- kernel_size: int = 3,
138
- bias: bool = True,
139
- stride=2,
140
- ) -> None:
141
- super().__init__()
142
- out_channels = out_channels or channels
143
-
144
- self.conv = HunyuanVideoCausalConv3d(channels, out_channels, kernel_size, stride, padding, bias=bias)
145
-
146
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
147
- hidden_states = self.conv(hidden_states)
148
- return hidden_states
149
-
150
-
151
- class HunyuanVideoResnetBlockCausal3D(nn.Module):
152
- def __init__(
153
- self,
154
- in_channels: int,
155
- out_channels: Optional[int] = None,
156
- dropout: float = 0.0,
157
- groups: int = 32,
158
- eps: float = 1e-6,
159
- non_linearity: str = "swish",
160
- ) -> None:
161
- super().__init__()
162
- out_channels = out_channels or in_channels
163
-
164
- self.nonlinearity = get_activation(non_linearity)
165
-
166
- self.norm1 = nn.GroupNorm(groups, in_channels, eps=eps, affine=True)
167
- self.conv1 = HunyuanVideoCausalConv3d(in_channels, out_channels, 3, 1, 0)
168
-
169
- self.norm2 = nn.GroupNorm(groups, out_channels, eps=eps, affine=True)
170
- self.dropout = nn.Dropout(dropout)
171
- self.conv2 = HunyuanVideoCausalConv3d(out_channels, out_channels, 3, 1, 0)
172
-
173
- self.conv_shortcut = None
174
- if in_channels != out_channels:
175
- self.conv_shortcut = HunyuanVideoCausalConv3d(in_channels, out_channels, 1, 1, 0)
176
-
177
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
178
- hidden_states = hidden_states.contiguous()
179
- residual = hidden_states
180
-
181
- hidden_states = self.norm1(hidden_states)
182
- hidden_states = self.nonlinearity(hidden_states)
183
- hidden_states = self.conv1(hidden_states)
184
-
185
- hidden_states = self.norm2(hidden_states)
186
- hidden_states = self.nonlinearity(hidden_states)
187
- hidden_states = self.dropout(hidden_states)
188
- hidden_states = self.conv2(hidden_states)
189
-
190
- if self.conv_shortcut is not None:
191
- residual = self.conv_shortcut(residual)
192
-
193
- hidden_states = hidden_states + residual
194
- return hidden_states
195
-
196
-
197
- class HunyuanVideoMidBlock3D(nn.Module):
198
- def __init__(
199
- self,
200
- in_channels: int,
201
- dropout: float = 0.0,
202
- num_layers: int = 1,
203
- resnet_eps: float = 1e-6,
204
- resnet_act_fn: str = "swish",
205
- resnet_groups: int = 32,
206
- add_attention: bool = True,
207
- attention_head_dim: int = 1,
208
- ) -> None:
209
- super().__init__()
210
- resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
211
- self.add_attention = add_attention
212
-
213
- # There is always at least one resnet
214
- resnets = [
215
- HunyuanVideoResnetBlockCausal3D(
216
- in_channels=in_channels,
217
- out_channels=in_channels,
218
- eps=resnet_eps,
219
- groups=resnet_groups,
220
- dropout=dropout,
221
- non_linearity=resnet_act_fn,
222
- )
223
- ]
224
- attentions = []
225
-
226
- for _ in range(num_layers):
227
- if self.add_attention:
228
- attentions.append(
229
- Attention(
230
- in_channels,
231
- heads=in_channels // attention_head_dim,
232
- dim_head=attention_head_dim,
233
- eps=resnet_eps,
234
- norm_num_groups=resnet_groups,
235
- residual_connection=True,
236
- bias=True,
237
- upcast_softmax=True,
238
- _from_deprecated_attn_block=True,
239
- )
240
- )
241
- else:
242
- attentions.append(None)
243
-
244
- resnets.append(
245
- HunyuanVideoResnetBlockCausal3D(
246
- in_channels=in_channels,
247
- out_channels=in_channels,
248
- eps=resnet_eps,
249
- groups=resnet_groups,
250
- dropout=dropout,
251
- non_linearity=resnet_act_fn,
252
- )
253
- )
254
-
255
- self.attentions = nn.ModuleList(attentions)
256
- self.resnets = nn.ModuleList(resnets)
257
-
258
- self.gradient_checkpointing = False
259
-
260
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
261
- if torch.is_grad_enabled() and self.gradient_checkpointing:
262
- hidden_states = self._gradient_checkpointing_func(self.resnets[0], hidden_states)
263
-
264
- for attn, resnet in zip(self.attentions, self.resnets[1:]):
265
- if attn is not None:
266
- batch_size, num_channels, num_frames, height, width = hidden_states.shape
267
- hidden_states = hidden_states.permute(0, 2, 3, 4, 1).flatten(1, 3)
268
- attention_mask = prepare_causal_attention_mask(
269
- num_frames, height * width, hidden_states.dtype, hidden_states.device, batch_size=batch_size
270
- )
271
- hidden_states = attn(hidden_states, attention_mask=attention_mask)
272
- hidden_states = hidden_states.unflatten(1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
273
-
274
- hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
275
-
276
- else:
277
- hidden_states = self.resnets[0](hidden_states)
278
-
279
- for attn, resnet in zip(self.attentions, self.resnets[1:]):
280
- if attn is not None:
281
- batch_size, num_channels, num_frames, height, width = hidden_states.shape
282
- hidden_states = hidden_states.permute(0, 2, 3, 4, 1).flatten(1, 3)
283
- attention_mask = prepare_causal_attention_mask(
284
- num_frames, height * width, hidden_states.dtype, hidden_states.device, batch_size=batch_size
285
- )
286
- hidden_states = attn(hidden_states, attention_mask=attention_mask)
287
- hidden_states = hidden_states.unflatten(1, (num_frames, height, width)).permute(0, 4, 1, 2, 3)
288
-
289
- hidden_states = resnet(hidden_states)
290
-
291
- return hidden_states
292
-
293
-
294
- class HunyuanVideoDownBlock3D(nn.Module):
295
- def __init__(
296
- self,
297
- in_channels: int,
298
- out_channels: int,
299
- dropout: float = 0.0,
300
- num_layers: int = 1,
301
- resnet_eps: float = 1e-6,
302
- resnet_act_fn: str = "swish",
303
- resnet_groups: int = 32,
304
- add_downsample: bool = True,
305
- downsample_stride: int = 2,
306
- downsample_padding: int = 1,
307
- ) -> None:
308
- super().__init__()
309
- resnets = []
310
-
311
- for i in range(num_layers):
312
- in_channels = in_channels if i == 0 else out_channels
313
- resnets.append(
314
- HunyuanVideoResnetBlockCausal3D(
315
- in_channels=in_channels,
316
- out_channels=out_channels,
317
- eps=resnet_eps,
318
- groups=resnet_groups,
319
- dropout=dropout,
320
- non_linearity=resnet_act_fn,
321
- )
322
- )
323
-
324
- self.resnets = nn.ModuleList(resnets)
325
-
326
- if add_downsample:
327
- self.downsamplers = nn.ModuleList(
328
- [
329
- HunyuanVideoDownsampleCausal3D(
330
- out_channels,
331
- out_channels=out_channels,
332
- padding=downsample_padding,
333
- stride=downsample_stride,
334
- )
335
- ]
336
- )
337
- else:
338
- self.downsamplers = None
339
-
340
- self.gradient_checkpointing = False
341
-
342
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
343
- if torch.is_grad_enabled() and self.gradient_checkpointing:
344
- for resnet in self.resnets:
345
- hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
346
- else:
347
- for resnet in self.resnets:
348
- hidden_states = resnet(hidden_states)
349
-
350
- if self.downsamplers is not None:
351
- for downsampler in self.downsamplers:
352
- hidden_states = downsampler(hidden_states)
353
-
354
- return hidden_states
355
-
356
-
357
- class HunyuanVideoUpBlock3D(nn.Module):
358
- def __init__(
359
- self,
360
- in_channels: int,
361
- out_channels: int,
362
- dropout: float = 0.0,
363
- num_layers: int = 1,
364
- resnet_eps: float = 1e-6,
365
- resnet_act_fn: str = "swish",
366
- resnet_groups: int = 32,
367
- add_upsample: bool = True,
368
- upsample_scale_factor: Tuple[int, int, int] = (2, 2, 2),
369
- ) -> None:
370
- super().__init__()
371
- resnets = []
372
-
373
- for i in range(num_layers):
374
- input_channels = in_channels if i == 0 else out_channels
375
-
376
- resnets.append(
377
- HunyuanVideoResnetBlockCausal3D(
378
- in_channels=input_channels,
379
- out_channels=out_channels,
380
- eps=resnet_eps,
381
- groups=resnet_groups,
382
- dropout=dropout,
383
- non_linearity=resnet_act_fn,
384
- )
385
- )
386
-
387
- self.resnets = nn.ModuleList(resnets)
388
-
389
- if add_upsample:
390
- self.upsamplers = nn.ModuleList(
391
- [
392
- HunyuanVideoUpsampleCausal3D(
393
- out_channels,
394
- out_channels=out_channels,
395
- upsample_factor=upsample_scale_factor,
396
- )
397
- ]
398
- )
399
- else:
400
- self.upsamplers = None
401
-
402
- self.gradient_checkpointing = False
403
-
404
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
405
- if torch.is_grad_enabled() and self.gradient_checkpointing:
406
- for resnet in self.resnets:
407
- hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
408
-
409
- else:
410
- for resnet in self.resnets:
411
- hidden_states = resnet(hidden_states)
412
-
413
- if self.upsamplers is not None:
414
- for upsampler in self.upsamplers:
415
- hidden_states = upsampler(hidden_states)
416
-
417
- return hidden_states
418
-
419
-
420
- class HunyuanVideoEncoder3D(nn.Module):
421
- r"""
422
- Causal encoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
423
- """
424
-
425
- def __init__(
426
- self,
427
- in_channels: int = 3,
428
- out_channels: int = 3,
429
- down_block_types: Tuple[str, ...] = (
430
- "HunyuanVideoDownBlock3D",
431
- "HunyuanVideoDownBlock3D",
432
- "HunyuanVideoDownBlock3D",
433
- "HunyuanVideoDownBlock3D",
434
- ),
435
- block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
436
- layers_per_block: int = 2,
437
- norm_num_groups: int = 32,
438
- act_fn: str = "silu",
439
- double_z: bool = True,
440
- mid_block_add_attention=True,
441
- temporal_compression_ratio: int = 4,
442
- spatial_compression_ratio: int = 8,
443
- ) -> None:
444
- super().__init__()
445
-
446
- self.conv_in = HunyuanVideoCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, stride=1)
447
- self.mid_block = None
448
- self.down_blocks = nn.ModuleList([])
449
-
450
- output_channel = block_out_channels[0]
451
- for i, down_block_type in enumerate(down_block_types):
452
- if down_block_type != "HunyuanVideoDownBlock3D":
453
- raise ValueError(f"Unsupported down_block_type: {down_block_type}")
454
-
455
- input_channel = output_channel
456
- output_channel = block_out_channels[i]
457
- is_final_block = i == len(block_out_channels) - 1
458
- num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
459
- num_time_downsample_layers = int(np.log2(temporal_compression_ratio))
460
-
461
- if temporal_compression_ratio == 4:
462
- add_spatial_downsample = bool(i < num_spatial_downsample_layers)
463
- add_time_downsample = bool(
464
- i >= (len(block_out_channels) - 1 - num_time_downsample_layers) and not is_final_block
465
- )
466
- elif temporal_compression_ratio == 8:
467
- add_spatial_downsample = bool(i < num_spatial_downsample_layers)
468
- add_time_downsample = bool(i < num_time_downsample_layers)
469
- else:
470
- raise ValueError(f"Unsupported time_compression_ratio: {temporal_compression_ratio}")
471
-
472
- downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
473
- downsample_stride_T = (2,) if add_time_downsample else (1,)
474
- downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
475
-
476
- down_block = HunyuanVideoDownBlock3D(
477
- num_layers=layers_per_block,
478
- in_channels=input_channel,
479
- out_channels=output_channel,
480
- add_downsample=bool(add_spatial_downsample or add_time_downsample),
481
- resnet_eps=1e-6,
482
- resnet_act_fn=act_fn,
483
- resnet_groups=norm_num_groups,
484
- downsample_stride=downsample_stride,
485
- downsample_padding=0,
486
- )
487
-
488
- self.down_blocks.append(down_block)
489
-
490
- self.mid_block = HunyuanVideoMidBlock3D(
491
- in_channels=block_out_channels[-1],
492
- resnet_eps=1e-6,
493
- resnet_act_fn=act_fn,
494
- attention_head_dim=block_out_channels[-1],
495
- resnet_groups=norm_num_groups,
496
- add_attention=mid_block_add_attention,
497
- )
498
-
499
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
500
- self.conv_act = nn.SiLU()
501
-
502
- conv_out_channels = 2 * out_channels if double_z else out_channels
503
- self.conv_out = HunyuanVideoCausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
504
-
505
- self.gradient_checkpointing = False
506
-
507
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
508
- hidden_states = self.conv_in(hidden_states)
509
-
510
- if torch.is_grad_enabled() and self.gradient_checkpointing:
511
- for down_block in self.down_blocks:
512
- hidden_states = self._gradient_checkpointing_func(down_block, hidden_states)
513
-
514
- hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
515
- else:
516
- for down_block in self.down_blocks:
517
- hidden_states = down_block(hidden_states)
518
-
519
- hidden_states = self.mid_block(hidden_states)
520
-
521
- hidden_states = self.conv_norm_out(hidden_states)
522
- hidden_states = self.conv_act(hidden_states)
523
- hidden_states = self.conv_out(hidden_states)
524
-
525
- return hidden_states
526
-
527
-
528
- class HunyuanVideoDecoder3D(nn.Module):
529
- r"""
530
- Causal decoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
531
- """
532
-
533
- def __init__(
534
- self,
535
- in_channels: int = 3,
536
- out_channels: int = 3,
537
- up_block_types: Tuple[str, ...] = (
538
- "HunyuanVideoUpBlock3D",
539
- "HunyuanVideoUpBlock3D",
540
- "HunyuanVideoUpBlock3D",
541
- "HunyuanVideoUpBlock3D",
542
- ),
543
- block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
544
- layers_per_block: int = 2,
545
- norm_num_groups: int = 32,
546
- act_fn: str = "silu",
547
- mid_block_add_attention=True,
548
- time_compression_ratio: int = 4,
549
- spatial_compression_ratio: int = 8,
550
- ):
551
- super().__init__()
552
- self.layers_per_block = layers_per_block
553
-
554
- self.conv_in = HunyuanVideoCausalConv3d(in_channels, block_out_channels[-1], kernel_size=3, stride=1)
555
- self.up_blocks = nn.ModuleList([])
556
-
557
- # mid
558
- self.mid_block = HunyuanVideoMidBlock3D(
559
- in_channels=block_out_channels[-1],
560
- resnet_eps=1e-6,
561
- resnet_act_fn=act_fn,
562
- attention_head_dim=block_out_channels[-1],
563
- resnet_groups=norm_num_groups,
564
- add_attention=mid_block_add_attention,
565
- )
566
-
567
- # up
568
- reversed_block_out_channels = list(reversed(block_out_channels))
569
- output_channel = reversed_block_out_channels[0]
570
- for i, up_block_type in enumerate(up_block_types):
571
- if up_block_type != "HunyuanVideoUpBlock3D":
572
- raise ValueError(f"Unsupported up_block_type: {up_block_type}")
573
-
574
- prev_output_channel = output_channel
575
- output_channel = reversed_block_out_channels[i]
576
- is_final_block = i == len(block_out_channels) - 1
577
- num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
578
- num_time_upsample_layers = int(np.log2(time_compression_ratio))
579
-
580
- if time_compression_ratio == 4:
581
- add_spatial_upsample = bool(i < num_spatial_upsample_layers)
582
- add_time_upsample = bool(
583
- i >= len(block_out_channels) - 1 - num_time_upsample_layers and not is_final_block
584
- )
585
- else:
586
- raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
587
-
588
- upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
589
- upsample_scale_factor_T = (2,) if add_time_upsample else (1,)
590
- upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
591
-
592
- up_block = HunyuanVideoUpBlock3D(
593
- num_layers=self.layers_per_block + 1,
594
- in_channels=prev_output_channel,
595
- out_channels=output_channel,
596
- add_upsample=bool(add_spatial_upsample or add_time_upsample),
597
- upsample_scale_factor=upsample_scale_factor,
598
- resnet_eps=1e-6,
599
- resnet_act_fn=act_fn,
600
- resnet_groups=norm_num_groups,
601
- )
602
-
603
- self.up_blocks.append(up_block)
604
- prev_output_channel = output_channel
605
-
606
- # out
607
- self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
608
- self.conv_act = nn.SiLU()
609
- self.conv_out = HunyuanVideoCausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
610
-
611
- self.gradient_checkpointing = False
612
-
613
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
614
- hidden_states = self.conv_in(hidden_states)
615
-
616
- if torch.is_grad_enabled() and self.gradient_checkpointing:
617
- hidden_states = self._gradient_checkpointing_func(self.mid_block, hidden_states)
618
-
619
- for up_block in self.up_blocks:
620
- hidden_states = self._gradient_checkpointing_func(up_block, hidden_states)
621
- else:
622
- hidden_states = self.mid_block(hidden_states)
623
-
624
- for up_block in self.up_blocks:
625
- hidden_states = up_block(hidden_states)
626
-
627
- # post-process
628
- hidden_states = self.conv_norm_out(hidden_states)
629
- hidden_states = self.conv_act(hidden_states)
630
- hidden_states = self.conv_out(hidden_states)
631
-
632
- return hidden_states
633
-
634
-
635
- class AutoencoderKLHunyuanVideo(ModelMixin, ConfigMixin, FromOriginalModelMixin):
636
- r"""
637
- A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
638
- Introduced in [HunyuanVideo](https://huggingface.co/papers/2412.03603).
639
-
640
- This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
641
- for all models (such as downloading or saving).
642
- """
643
-
644
- _supports_gradient_checkpointing = True
645
-
646
- @register_to_config
647
- def __init__(
648
- self,
649
- in_channels: int = 3,
650
- out_channels: int = 3,
651
- latent_channels: int = 16,
652
- down_block_types: Tuple[str, ...] = (
653
- "HunyuanVideoDownBlock3D",
654
- "HunyuanVideoDownBlock3D",
655
- "HunyuanVideoDownBlock3D",
656
- "HunyuanVideoDownBlock3D",
657
- ),
658
- up_block_types: Tuple[str, ...] = (
659
- "HunyuanVideoUpBlock3D",
660
- "HunyuanVideoUpBlock3D",
661
- "HunyuanVideoUpBlock3D",
662
- "HunyuanVideoUpBlock3D",
663
- ),
664
- block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
665
- layers_per_block: int = 2,
666
- act_fn: str = "silu",
667
- norm_num_groups: int = 32,
668
- scaling_factor: float = 0.476986,
669
- spatial_compression_ratio: int = 8,
670
- temporal_compression_ratio: int = 4,
671
- mid_block_add_attention: bool = True,
672
- ) -> None:
673
- super().__init__()
674
-
675
- self.time_compression_ratio = temporal_compression_ratio
676
-
677
- self.encoder = HunyuanVideoEncoder3D(
678
- in_channels=in_channels,
679
- out_channels=latent_channels,
680
- down_block_types=down_block_types,
681
- block_out_channels=block_out_channels,
682
- layers_per_block=layers_per_block,
683
- norm_num_groups=norm_num_groups,
684
- act_fn=act_fn,
685
- double_z=True,
686
- mid_block_add_attention=mid_block_add_attention,
687
- temporal_compression_ratio=temporal_compression_ratio,
688
- spatial_compression_ratio=spatial_compression_ratio,
689
- )
690
-
691
- self.decoder = HunyuanVideoDecoder3D(
692
- in_channels=latent_channels,
693
- out_channels=out_channels,
694
- up_block_types=up_block_types,
695
- block_out_channels=block_out_channels,
696
- layers_per_block=layers_per_block,
697
- norm_num_groups=norm_num_groups,
698
- act_fn=act_fn,
699
- time_compression_ratio=temporal_compression_ratio,
700
- spatial_compression_ratio=spatial_compression_ratio,
701
- mid_block_add_attention=mid_block_add_attention,
702
- )
703
-
704
- self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
705
- self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
706
-
707
- self.spatial_compression_ratio = spatial_compression_ratio
708
- self.temporal_compression_ratio = temporal_compression_ratio
709
-
710
- # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
711
- # to perform decoding of a single video latent at a time.
712
- self.use_slicing = False
713
-
714
- # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
715
- # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
716
- # intermediate tiles together, the memory requirement can be lowered.
717
- self.use_tiling = True
718
-
719
- # When decoding temporally long video latents, the memory requirement is very high. By decoding latent frames
720
- # at a fixed frame batch size (based on `self.tile_sample_min_num_frames`), the memory requirement can be lowered.
721
- self.use_framewise_encoding = True
722
- self.use_framewise_decoding = True
723
-
724
- # The minimal tile height and width for spatial tiling to be used
725
- self.tile_sample_min_height = 256
726
- self.tile_sample_min_width = 256
727
- self.tile_sample_min_num_frames = 16
728
-
729
- # The minimal distance between two spatial tiles
730
- self.tile_sample_stride_height = 192
731
- self.tile_sample_stride_width = 192
732
- self.tile_sample_stride_num_frames = 12
733
-
734
- def enable_tiling(
735
- self,
736
- tile_sample_min_height: Optional[int] = None,
737
- tile_sample_min_width: Optional[int] = None,
738
- tile_sample_min_num_frames: Optional[int] = None,
739
- tile_sample_stride_height: Optional[float] = None,
740
- tile_sample_stride_width: Optional[float] = None,
741
- tile_sample_stride_num_frames: Optional[float] = None,
742
- ) -> None:
743
- r"""
744
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
745
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
746
- processing larger images.
747
-
748
- Args:
749
- tile_sample_min_height (`int`, *optional*):
750
- The minimum height required for a sample to be separated into tiles across the height dimension.
751
- tile_sample_min_width (`int`, *optional*):
752
- The minimum width required for a sample to be separated into tiles across the width dimension.
753
- tile_sample_min_num_frames (`int`, *optional*):
754
- The minimum number of frames required for a sample to be separated into tiles across the frame
755
- dimension.
756
- tile_sample_stride_height (`int`, *optional*):
757
- The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
758
- no tiling artifacts produced across the height dimension.
759
- tile_sample_stride_width (`int`, *optional*):
760
- The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
761
- artifacts produced across the width dimension.
762
- tile_sample_stride_num_frames (`int`, *optional*):
763
- The stride between two consecutive frame tiles. This is to ensure that there are no tiling artifacts
764
- produced across the frame dimension.
765
- """
766
- self.use_tiling = True
767
- self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
768
- self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
769
- self.tile_sample_min_num_frames = tile_sample_min_num_frames or self.tile_sample_min_num_frames
770
- self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
771
- self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
772
- self.tile_sample_stride_num_frames = tile_sample_stride_num_frames or self.tile_sample_stride_num_frames
773
-
774
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
775
- batch_size, num_channels, num_frames, height, width = x.shape
776
-
777
- if self.use_framewise_encoding and num_frames > self.tile_sample_min_num_frames:
778
- return self._temporal_tiled_encode(x)
779
-
780
- if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
781
- return self.tiled_encode(x)
782
-
783
- x = self.encoder(x)
784
- enc = self.quant_conv(x)
785
- return enc
786
-
787
- @apply_forward_hook
788
- def encode(
789
- self, x: torch.Tensor, return_dict: bool = True
790
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
791
- r"""
792
- Encode a batch of images into latents.
793
-
794
- Args:
795
- x (`torch.Tensor`): Input batch of images.
796
- return_dict (`bool`, *optional*, defaults to `True`):
797
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
798
-
799
- Returns:
800
- The latent representations of the encoded videos. If `return_dict` is True, a
801
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
802
- """
803
- if self.use_slicing and x.shape[0] > 1:
804
- encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
805
- h = torch.cat(encoded_slices)
806
- else:
807
- h = self._encode(x)
808
-
809
- posterior = DiagonalGaussianDistribution(h)
810
-
811
- if not return_dict:
812
- return (posterior,)
813
- return AutoencoderKLOutput(latent_dist=posterior)
814
-
815
- def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
816
- batch_size, num_channels, num_frames, height, width = z.shape
817
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
818
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
819
- tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
820
-
821
- if self.use_framewise_decoding and num_frames > tile_latent_min_num_frames:
822
- return self._temporal_tiled_decode(z, return_dict=return_dict)
823
-
824
- if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
825
- return self.tiled_decode(z, return_dict=return_dict)
826
-
827
- z = self.post_quant_conv(z)
828
- dec = self.decoder(z)
829
-
830
- if not return_dict:
831
- return (dec,)
832
-
833
- return DecoderOutput(sample=dec)
834
-
835
- @apply_forward_hook
836
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
837
- r"""
838
- Decode a batch of images.
839
-
840
- Args:
841
- z (`torch.Tensor`): Input batch of latent vectors.
842
- return_dict (`bool`, *optional*, defaults to `True`):
843
- Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
844
-
845
- Returns:
846
- [`~models.vae.DecoderOutput`] or `tuple`:
847
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
848
- returned.
849
- """
850
- if self.use_slicing and z.shape[0] > 1:
851
- decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
852
- decoded = torch.cat(decoded_slices)
853
- else:
854
- decoded = self._decode(z).sample
855
-
856
- if not return_dict:
857
- return (decoded,)
858
-
859
- return DecoderOutput(sample=decoded)
860
-
861
- def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
862
- blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
863
- for y in range(blend_extent):
864
- b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
865
- y / blend_extent
866
- )
867
- return b
868
-
869
- def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
870
- blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
871
- for x in range(blend_extent):
872
- b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
873
- x / blend_extent
874
- )
875
- return b
876
-
877
- def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
878
- blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
879
- for x in range(blend_extent):
880
- b[:, :, x, :, :] = a[:, :, -blend_extent + x, :, :] * (1 - x / blend_extent) + b[:, :, x, :, :] * (
881
- x / blend_extent
882
- )
883
- return b
884
-
885
- def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
886
- r"""Encode a batch of images using a tiled encoder.
887
-
888
- Args:
889
- x (`torch.Tensor`): Input batch of videos.
890
-
891
- Returns:
892
- `torch.Tensor`:
893
- The latent representation of the encoded videos.
894
- """
895
- batch_size, num_channels, num_frames, height, width = x.shape
896
- latent_height = height // self.spatial_compression_ratio
897
- latent_width = width // self.spatial_compression_ratio
898
-
899
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
900
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
901
- tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
902
- tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
903
-
904
- blend_height = tile_latent_min_height - tile_latent_stride_height
905
- blend_width = tile_latent_min_width - tile_latent_stride_width
906
-
907
- # Split x into overlapping tiles and encode them separately.
908
- # The tiles have an overlap to avoid seams between tiles.
909
- rows = []
910
- for i in range(0, height, self.tile_sample_stride_height):
911
- row = []
912
- for j in range(0, width, self.tile_sample_stride_width):
913
- tile = x[:, :, :, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
914
- tile = self.encoder(tile)
915
- tile = self.quant_conv(tile)
916
- row.append(tile)
917
- rows.append(row)
918
-
919
- result_rows = []
920
- for i, row in enumerate(rows):
921
- result_row = []
922
- for j, tile in enumerate(row):
923
- # blend the above tile and the left tile
924
- # to the current tile and add the current tile to the result row
925
- if i > 0:
926
- tile = self.blend_v(rows[i - 1][j], tile, blend_height)
927
- if j > 0:
928
- tile = self.blend_h(row[j - 1], tile, blend_width)
929
- result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
930
- result_rows.append(torch.cat(result_row, dim=4))
931
-
932
- enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
933
- return enc
934
-
935
- def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
936
- r"""
937
- Decode a batch of images using a tiled decoder.
938
-
939
- Args:
940
- z (`torch.Tensor`): Input batch of latent vectors.
941
- return_dict (`bool`, *optional*, defaults to `True`):
942
- Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
943
-
944
- Returns:
945
- [`~models.vae.DecoderOutput`] or `tuple`:
946
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
947
- returned.
948
- """
949
-
950
- batch_size, num_channels, num_frames, height, width = z.shape
951
- sample_height = height * self.spatial_compression_ratio
952
- sample_width = width * self.spatial_compression_ratio
953
-
954
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
955
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
956
- tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
957
- tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
958
-
959
- blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
960
- blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
961
-
962
- # Split z into overlapping tiles and decode them separately.
963
- # The tiles have an overlap to avoid seams between tiles.
964
- rows = []
965
- for i in range(0, height, tile_latent_stride_height):
966
- row = []
967
- for j in range(0, width, tile_latent_stride_width):
968
- tile = z[:, :, :, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
969
- tile = self.post_quant_conv(tile)
970
- decoded = self.decoder(tile)
971
- row.append(decoded)
972
- rows.append(row)
973
-
974
- result_rows = []
975
- for i, row in enumerate(rows):
976
- result_row = []
977
- for j, tile in enumerate(row):
978
- # blend the above tile and the left tile
979
- # to the current tile and add the current tile to the result row
980
- if i > 0:
981
- tile = self.blend_v(rows[i - 1][j], tile, blend_height)
982
- if j > 0:
983
- tile = self.blend_h(row[j - 1], tile, blend_width)
984
- result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
985
- result_rows.append(torch.cat(result_row, dim=-1))
986
-
987
- dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
988
-
989
- if not return_dict:
990
- return (dec,)
991
- return DecoderOutput(sample=dec)
992
-
993
- def _temporal_tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
994
- batch_size, num_channels, num_frames, height, width = x.shape
995
- latent_num_frames = (num_frames - 1) // self.temporal_compression_ratio + 1
996
-
997
- tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
998
- tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
999
- blend_num_frames = tile_latent_min_num_frames - tile_latent_stride_num_frames
1000
-
1001
- row = []
1002
- for i in range(0, num_frames, self.tile_sample_stride_num_frames):
1003
- tile = x[:, :, i : i + self.tile_sample_min_num_frames + 1, :, :]
1004
- if self.use_tiling and (height > self.tile_sample_min_height or width > self.tile_sample_min_width):
1005
- tile = self.tiled_encode(tile)
1006
- else:
1007
- tile = self.encoder(tile)
1008
- tile = self.quant_conv(tile)
1009
- if i > 0:
1010
- tile = tile[:, :, 1:, :, :]
1011
- row.append(tile)
1012
-
1013
- result_row = []
1014
- for i, tile in enumerate(row):
1015
- if i > 0:
1016
- tile = self.blend_t(row[i - 1], tile, blend_num_frames)
1017
- result_row.append(tile[:, :, :tile_latent_stride_num_frames, :, :])
1018
- else:
1019
- result_row.append(tile[:, :, : tile_latent_stride_num_frames + 1, :, :])
1020
-
1021
- enc = torch.cat(result_row, dim=2)[:, :, :latent_num_frames]
1022
- return enc
1023
-
1024
- def _temporal_tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1025
- batch_size, num_channels, num_frames, height, width = z.shape
1026
- num_sample_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
1027
-
1028
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1029
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1030
- tile_latent_min_num_frames = self.tile_sample_min_num_frames // self.temporal_compression_ratio
1031
- tile_latent_stride_num_frames = self.tile_sample_stride_num_frames // self.temporal_compression_ratio
1032
- blend_num_frames = self.tile_sample_min_num_frames - self.tile_sample_stride_num_frames
1033
-
1034
- row = []
1035
- for i in range(0, num_frames, tile_latent_stride_num_frames):
1036
- tile = z[:, :, i : i + tile_latent_min_num_frames + 1, :, :]
1037
- if self.use_tiling and (tile.shape[-1] > tile_latent_min_width or tile.shape[-2] > tile_latent_min_height):
1038
- decoded = self.tiled_decode(tile, return_dict=True).sample
1039
- else:
1040
- tile = self.post_quant_conv(tile)
1041
- decoded = self.decoder(tile)
1042
- if i > 0:
1043
- decoded = decoded[:, :, 1:, :, :]
1044
- row.append(decoded)
1045
-
1046
- result_row = []
1047
- for i, tile in enumerate(row):
1048
- if i > 0:
1049
- tile = self.blend_t(row[i - 1], tile, blend_num_frames)
1050
- result_row.append(tile[:, :, : self.tile_sample_stride_num_frames, :, :])
1051
- else:
1052
- result_row.append(tile[:, :, : self.tile_sample_stride_num_frames + 1, :, :])
1053
-
1054
- dec = torch.cat(result_row, dim=2)[:, :, :num_sample_frames]
1055
-
1056
- if not return_dict:
1057
- return (dec,)
1058
- return DecoderOutput(sample=dec)
1059
-
1060
- def forward(
1061
- self,
1062
- sample: torch.Tensor,
1063
- sample_posterior: bool = False,
1064
- return_dict: bool = True,
1065
- generator: Optional[torch.Generator] = None,
1066
- ) -> Union[DecoderOutput, torch.Tensor]:
1067
- r"""
1068
- Args:
1069
- sample (`torch.Tensor`): Input sample.
1070
- sample_posterior (`bool`, *optional*, defaults to `False`):
1071
- Whether to sample from the posterior.
1072
- return_dict (`bool`, *optional*, defaults to `True`):
1073
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
1074
- """
1075
- x = sample
1076
- posterior = self.encode(x).latent_dist
1077
- if sample_posterior:
1078
- z = posterior.sample(generator=generator)
1079
- else:
1080
- z = posterior.mode()
1081
- dec = self.decode(z, return_dict=return_dict)
1082
- return dec
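
The `AutoencoderKLHunyuanVideo` defined above enables spatial tiling and frame-wise encoding/decoding by default, splitting large inputs into overlapping tiles and blending the overlaps with `blend_v`, `blend_h`, and `blend_t` so long or high-resolution videos stay within bounded memory. A minimal sketch of driving the removed class directly follows; the import path, tensor shape, and tile sizes are illustrative assumptions only.

# Hypothetical sketch of encoding/decoding a short clip with the removed tiled VAE.
# The import path, tensor shape, and tile sizes below are assumptions for illustration.
import torch
# from videox_fun.models import AutoencoderKLHunyuanVideo  # import path assumed

vae = AutoencoderKLHunyuanVideo()                       # default config defined above
vae.enable_tiling(tile_sample_min_height=256,           # samples taller than this are split into tiles
                  tile_sample_stride_height=192)        # stride < min height leaves a 64-pixel overlap to blend
with torch.no_grad():
    video = torch.randn(1, 3, 17, 512, 512)             # (batch, channels, frames, height, width)
    latents = vae.encode(video).latent_dist.sample()    # DiagonalGaussianDistribution -> sampled latents
    frames = vae.decode(latents).sample                 # DecoderOutput.sample, back in pixel space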
 
videox_fun/models/qwenimage_transformer2d.py DELETED
@@ -1,1118 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/transformer_qwenimage.py
2
- # Copyright 2025 Qwen-Image Team, The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import functools
18
- import inspect
19
- import glob
20
- import json
21
- import math
22
- import os
23
- import types
24
- import warnings
25
- from typing import Any, Dict, List, Optional, Tuple, Union
26
-
27
- import numpy as np
28
- import torch
29
- import torch.cuda.amp as amp
30
- import torch.nn as nn
31
- import torch.nn.functional as F
32
- from diffusers.configuration_utils import ConfigMixin, register_to_config
33
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
34
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
35
- from diffusers.models.attention import Attention, FeedForward
36
- from diffusers.models.attention_processor import (
37
- Attention, AttentionProcessor, CogVideoXAttnProcessor2_0,
38
- FusedCogVideoXAttnProcessor2_0)
39
- from diffusers.models.embeddings import (CogVideoXPatchEmbed,
40
- TimestepEmbedding, Timesteps,
41
- get_3d_sincos_pos_embed)
42
- from diffusers.models.modeling_outputs import Transformer2DModelOutput
43
- from diffusers.models.modeling_utils import ModelMixin
44
- from diffusers.models.normalization import (AdaLayerNorm,
45
- AdaLayerNormContinuous,
46
- CogVideoXLayerNormZero, RMSNorm)
47
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
48
- scale_lora_layers, unscale_lora_layers)
49
- from diffusers.utils.torch_utils import maybe_allow_in_graph
50
- from torch import nn
51
-
52
- from ..dist import (QwenImageMultiGPUsAttnProcessor2_0,
53
- get_sequence_parallel_rank,
54
- get_sequence_parallel_world_size, get_sp_group)
55
- from .attention_utils import attention
56
- from .cache_utils import TeaCache
57
- from ..utils import cfg_skip
58
-
59
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
60
-
61
-
62
- def get_timestep_embedding(
63
- timesteps: torch.Tensor,
64
- embedding_dim: int,
65
- flip_sin_to_cos: bool = False,
66
- downscale_freq_shift: float = 1,
67
- scale: float = 1,
68
- max_period: int = 10000,
69
- ) -> torch.Tensor:
70
- """
71
- This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
72
-
73
- Args
74
- timesteps (torch.Tensor):
75
- a 1-D Tensor of N indices, one per batch element. These may be fractional.
76
- embedding_dim (int):
77
- the dimension of the output.
78
- flip_sin_to_cos (bool):
79
- Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
80
- downscale_freq_shift (float):
81
- Controls the delta between frequencies between dimensions
82
- scale (float):
83
- Scaling factor applied to the embeddings.
84
- max_period (int):
85
- Controls the maximum frequency of the embeddings
86
- Returns:
87
- torch.Tensor: an [N x dim] Tensor of positional embeddings.
88
- """
89
- assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
90
-
91
- half_dim = embedding_dim // 2
92
- exponent = -math.log(max_period) * torch.arange(
93
- start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
94
- )
95
- exponent = exponent / (half_dim - downscale_freq_shift)
96
-
97
- emb = torch.exp(exponent).to(timesteps.dtype)
98
- emb = timesteps[:, None].float() * emb[None, :]
99
-
100
- # scale embeddings
101
- emb = scale * emb
102
-
103
- # concat sine and cosine embeddings
104
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
105
-
106
- # flip sine and cosine embeddings
107
- if flip_sin_to_cos:
108
- emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
109
-
110
- # zero pad
111
- if embedding_dim % 2 == 1:
112
- emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
113
- return emb
114
-
115
-
116
- def apply_rotary_emb_qwen(
117
- x: torch.Tensor,
118
- freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
119
- use_real: bool = True,
120
- use_real_unbind_dim: int = -1,
121
- ) -> Tuple[torch.Tensor, torch.Tensor]:
122
- """
123
- Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
124
- to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
125
- reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
126
- tensors contain rotary embeddings and are returned as real tensors.
127
-
128
- Args:
129
- x (`torch.Tensor`):
130
- Query or key tensor to apply rotary embeddings, of shape [B, S, H, D].
131
- freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
132
-
133
- Returns:
134
- Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
135
- """
136
- if use_real:
137
- cos, sin = freqs_cis # [S, D]
138
- cos = cos[None, None]
139
- sin = sin[None, None]
140
- cos, sin = cos.to(x.device), sin.to(x.device)
141
-
142
- if use_real_unbind_dim == -1:
143
- # Used for flux, cogvideox, hunyuan-dit
144
- x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
145
- x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
146
- elif use_real_unbind_dim == -2:
147
- # Used for Stable Audio, OmniGen, CogView4 and Cosmos
148
- x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
149
- x_rotated = torch.cat([-x_imag, x_real], dim=-1)
150
- else:
151
- raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
152
-
153
- out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
154
-
155
- return out
156
- else:
157
- x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
158
- freqs_cis = freqs_cis.unsqueeze(1)
159
- x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
160
-
161
- return x_out.type_as(x)
162
-
163
-
164
- class QwenTimestepProjEmbeddings(nn.Module):
165
- def __init__(self, embedding_dim):
166
- super().__init__()
167
-
168
- self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
169
- self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
170
-
171
- def forward(self, timestep, hidden_states):
172
- timesteps_proj = self.time_proj(timestep)
173
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) # (N, D)
174
-
175
- conditioning = timesteps_emb
176
-
177
- return conditioning
178
-
179
-
180
- class QwenEmbedRope(nn.Module):
181
- def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
182
- super().__init__()
183
- self.theta = theta
184
- self.axes_dim = axes_dim
185
- pos_index = torch.arange(4096)
186
- neg_index = torch.arange(4096).flip(0) * -1 - 1
187
- self.pos_freqs = torch.cat(
188
- [
189
- self.rope_params(pos_index, self.axes_dim[0], self.theta),
190
- self.rope_params(pos_index, self.axes_dim[1], self.theta),
191
- self.rope_params(pos_index, self.axes_dim[2], self.theta),
192
- ],
193
- dim=1,
194
- )
195
- self.neg_freqs = torch.cat(
196
- [
197
- self.rope_params(neg_index, self.axes_dim[0], self.theta),
198
- self.rope_params(neg_index, self.axes_dim[1], self.theta),
199
- self.rope_params(neg_index, self.axes_dim[2], self.theta),
200
- ],
201
- dim=1,
202
- )
203
- self.rope_cache = {}
204
-
205
- # DO NOT USE register_buffer HERE; IT WOULD CAUSE THE COMPLEX NUMBERS TO LOSE THEIR IMAGINARY PART
206
- self.scale_rope = scale_rope
207
-
208
- def rope_params(self, index, dim, theta=10000):
209
- """
210
- Args:
211
- index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
212
- """
213
- assert dim % 2 == 0
214
- freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
215
- freqs = torch.polar(torch.ones_like(freqs), freqs)
216
- return freqs
217
-
218
- def forward(self, video_fhw, txt_seq_lens, device):
219
- """
220
- Args: video_fhw: [frame, height, width], a list of 3 integers giving the shape of the packed video latents.
221
- txt_seq_lens: [bs], a list of integers giving the length of each text sequence.
222
- """
223
- if self.pos_freqs.device != device:
224
- self.pos_freqs = self.pos_freqs.to(device)
225
- self.neg_freqs = self.neg_freqs.to(device)
226
-
227
- if isinstance(video_fhw, list):
228
- video_fhw = video_fhw[0]
229
- if not isinstance(video_fhw, list):
230
- video_fhw = [video_fhw]
231
-
232
- vid_freqs = []
233
- max_vid_index = 0
234
- for idx, fhw in enumerate(video_fhw):
235
- frame, height, width = fhw
236
- rope_key = f"{idx}_{frame}_{height}_{width}"
237
- if not torch.compiler.is_compiling():
238
- if rope_key not in self.rope_cache:
239
- self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width, idx)
240
- video_freq = self.rope_cache[rope_key]
241
- else:
242
- video_freq = self._compute_video_freqs(frame, height, width, idx)
243
- video_freq = video_freq.to(device)
244
- vid_freqs.append(video_freq)
245
-
246
- if self.scale_rope:
247
- max_vid_index = max(height // 2, width // 2, max_vid_index)
248
- else:
249
- max_vid_index = max(height, width, max_vid_index)
250
-
251
- max_len = max(txt_seq_lens)
252
- txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]
253
- vid_freqs = torch.cat(vid_freqs, dim=0)
254
-
255
- return vid_freqs, txt_freqs
256
-
257
- @functools.lru_cache(maxsize=None)
258
- def _compute_video_freqs(self, frame, height, width, idx=0):
259
- seq_lens = frame * height * width
260
- freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
261
- freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
262
-
263
- freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
264
- if self.scale_rope:
265
- freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
266
- freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
267
- freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
268
- freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
269
- else:
270
- freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
271
- freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
272
-
273
- freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
274
- return freqs.clone().contiguous()
275
-
276
-
277
- class QwenDoubleStreamAttnProcessor2_0:
278
- """
279
- Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor
280
- implements joint attention computation where text and image streams are processed together.
281
- """
282
-
283
- _attention_backend = None
284
-
285
- def __init__(self):
286
- if not hasattr(F, "scaled_dot_product_attention"):
287
- raise ImportError(
288
- "QwenDoubleStreamAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
289
- )
290
-
291
- def __call__(
292
- self,
293
- attn: Attention,
294
- hidden_states: torch.FloatTensor, # Image stream
295
- encoder_hidden_states: torch.FloatTensor = None, # Text stream
296
- encoder_hidden_states_mask: torch.FloatTensor = None,
297
- attention_mask: Optional[torch.FloatTensor] = None,
298
- image_rotary_emb: Optional[torch.Tensor] = None,
299
- ) -> torch.FloatTensor:
300
- if encoder_hidden_states is None:
301
- raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")
302
-
303
- seq_txt = encoder_hidden_states.shape[1]
304
-
305
- # Compute QKV for image stream (sample projections)
306
- img_query = attn.to_q(hidden_states)
307
- img_key = attn.to_k(hidden_states)
308
- img_value = attn.to_v(hidden_states)
309
-
310
- # Compute QKV for text stream (context projections)
311
- txt_query = attn.add_q_proj(encoder_hidden_states)
312
- txt_key = attn.add_k_proj(encoder_hidden_states)
313
- txt_value = attn.add_v_proj(encoder_hidden_states)
314
-
315
- # Reshape for multi-head attention
316
- img_query = img_query.unflatten(-1, (attn.heads, -1))
317
- img_key = img_key.unflatten(-1, (attn.heads, -1))
318
- img_value = img_value.unflatten(-1, (attn.heads, -1))
319
-
320
- txt_query = txt_query.unflatten(-1, (attn.heads, -1))
321
- txt_key = txt_key.unflatten(-1, (attn.heads, -1))
322
- txt_value = txt_value.unflatten(-1, (attn.heads, -1))
323
-
324
- # Apply QK normalization
325
- if attn.norm_q is not None:
326
- img_query = attn.norm_q(img_query)
327
- if attn.norm_k is not None:
328
- img_key = attn.norm_k(img_key)
329
- if attn.norm_added_q is not None:
330
- txt_query = attn.norm_added_q(txt_query)
331
- if attn.norm_added_k is not None:
332
- txt_key = attn.norm_added_k(txt_key)
333
-
334
- # Apply RoPE
335
- if image_rotary_emb is not None:
336
- img_freqs, txt_freqs = image_rotary_emb
337
- img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
338
- img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
339
- txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
340
- txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)
341
-
342
- # Concatenate for joint attention
343
- # Order: [text, image]
344
- joint_query = torch.cat([txt_query, img_query], dim=1)
345
- joint_key = torch.cat([txt_key, img_key], dim=1)
346
- joint_value = torch.cat([txt_value, img_value], dim=1)
347
-
348
- joint_hidden_states = attention(
349
- joint_query, joint_key, joint_value, attn_mask=attention_mask, dropout_p=0.0, causal=False
350
- )
351
-
352
- # Reshape back
353
- joint_hidden_states = joint_hidden_states.flatten(2, 3)
354
- joint_hidden_states = joint_hidden_states.to(joint_query.dtype)
355
-
356
- # Split attention outputs back
357
- txt_attn_output = joint_hidden_states[:, :seq_txt, :] # Text part
358
- img_attn_output = joint_hidden_states[:, seq_txt:, :] # Image part
359
-
360
- # Apply output projections
361
- img_attn_output = attn.to_out[0](img_attn_output)
362
- if len(attn.to_out) > 1:
363
- img_attn_output = attn.to_out[1](img_attn_output) # dropout
364
-
365
- txt_attn_output = attn.to_add_out(txt_attn_output)
366
-
367
- return img_attn_output, txt_attn_output
368
-
369
-
370
- @maybe_allow_in_graph
371
- class QwenImageTransformerBlock(nn.Module):
372
- def __init__(
373
- self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
374
- ):
375
- super().__init__()
376
-
377
- self.dim = dim
378
- self.num_attention_heads = num_attention_heads
379
- self.attention_head_dim = attention_head_dim
380
-
381
- # Image processing modules
382
- self.img_mod = nn.Sequential(
383
- nn.SiLU(),
384
- nn.Linear(dim, 6 * dim, bias=True), # For scale, shift, gate for norm1 and norm2
385
- )
386
- self.img_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
387
- self.attn = Attention(
388
- query_dim=dim,
389
- cross_attention_dim=None, # Enable cross attention for joint computation
390
- added_kv_proj_dim=dim, # Enable added KV projections for text stream
391
- dim_head=attention_head_dim,
392
- heads=num_attention_heads,
393
- out_dim=dim,
394
- context_pre_only=False,
395
- bias=True,
396
- processor=QwenDoubleStreamAttnProcessor2_0(),
397
- qk_norm=qk_norm,
398
- eps=eps,
399
- )
400
- self.img_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
401
- self.img_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
402
-
403
- # Text processing modules
404
- self.txt_mod = nn.Sequential(
405
- nn.SiLU(),
406
- nn.Linear(dim, 6 * dim, bias=True), # For scale, shift, gate for norm1 and norm2
407
- )
408
- self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
409
- # Text doesn't need separate attention - it's handled by img_attn joint computation
410
- self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
411
- self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
412
-
413
- def _modulate(self, x, mod_params):
414
- """Apply modulation to input tensor"""
415
- shift, scale, gate = mod_params.chunk(3, dim=-1)
416
- return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
417
-
418
- def forward(
419
- self,
420
- hidden_states: torch.Tensor,
421
- encoder_hidden_states: torch.Tensor,
422
- encoder_hidden_states_mask: torch.Tensor,
423
- temb: torch.Tensor,
424
- image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
425
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
426
- ) -> Tuple[torch.Tensor, torch.Tensor]:
427
- # Get modulation parameters for both streams
428
- img_mod_params = self.img_mod(temb) # [B, 6*dim]
429
- txt_mod_params = self.txt_mod(temb) # [B, 6*dim]
430
-
431
- # Split modulation parameters for norm1 and norm2
432
- img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) # Each [B, 3*dim]
433
- txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) # Each [B, 3*dim]
434
-
435
- # Process image stream - norm1 + modulation
436
- img_normed = self.img_norm1(hidden_states)
437
- img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
438
-
439
- # Process text stream - norm1 + modulation
440
- txt_normed = self.txt_norm1(encoder_hidden_states)
441
- txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
442
-
443
- # Use QwenDoubleStreamAttnProcessor2_0 for joint attention computation
444
- # This directly implements the DoubleStreamLayerMegatron logic:
445
- # 1. Computes QKV for both streams
446
- # 2. Applies QK normalization and RoPE
447
- # 3. Concatenates and runs joint attention
448
- # 4. Splits results back to separate streams
449
- joint_attention_kwargs = joint_attention_kwargs or {}
450
- attn_output = self.attn(
451
- hidden_states=img_modulated, # Image stream (will be processed as "sample")
452
- encoder_hidden_states=txt_modulated, # Text stream (will be processed as "context")
453
- encoder_hidden_states_mask=encoder_hidden_states_mask,
454
- image_rotary_emb=image_rotary_emb,
455
- **joint_attention_kwargs,
456
- )
457
-
458
- # QwenDoubleStreamAttnProcessor2_0 returns (img_output, txt_output) when encoder_hidden_states is provided
459
- img_attn_output, txt_attn_output = attn_output
460
-
461
- # Apply attention gates and add residual (like in Megatron)
462
- hidden_states = hidden_states + img_gate1 * img_attn_output
463
- encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
464
-
465
- # Process image stream - norm2 + MLP
466
- img_normed2 = self.img_norm2(hidden_states)
467
- img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
468
- img_mlp_output = self.img_mlp(img_modulated2)
469
- hidden_states = hidden_states + img_gate2 * img_mlp_output
470
-
471
- # Process text stream - norm2 + MLP
472
- txt_normed2 = self.txt_norm2(encoder_hidden_states)
473
- txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
474
- txt_mlp_output = self.txt_mlp(txt_modulated2)
475
- encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output
476
-
477
- # Clip to prevent overflow for fp16
478
- if encoder_hidden_states.dtype == torch.float16:
479
- encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
480
- if hidden_states.dtype == torch.float16:
481
- hidden_states = hidden_states.clip(-65504, 65504)
482
-
483
- return encoder_hidden_states, hidden_states
484
-
485
-
486
- class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
487
- """
488
- The Transformer model introduced in Qwen.
489
-
490
- Args:
491
- patch_size (`int`, defaults to `2`):
492
- Patch size to turn the input data into small patches.
493
- in_channels (`int`, defaults to `64`):
494
- The number of channels in the input.
495
- out_channels (`int`, *optional*, defaults to `16`):
496
- The number of channels in the output. If not specified, it defaults to `in_channels`.
497
- num_layers (`int`, defaults to `60`):
498
- The number of layers of dual stream DiT blocks to use.
499
- attention_head_dim (`int`, defaults to `128`):
500
- The number of dimensions to use for each attention head.
501
- num_attention_heads (`int`, defaults to `24`):
502
- The number of attention heads to use.
503
- joint_attention_dim (`int`, defaults to `3584`):
504
- The number of dimensions to use for the joint attention (embedding/channel dimension of
505
- `encoder_hidden_states`).
506
- guidance_embeds (`bool`, defaults to `False`):
507
- Whether to use guidance embeddings for guidance-distilled variant of the model.
508
- axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
509
- The dimensions to use for the rotary positional embeddings.
510
- """
511
-
512
- # _supports_gradient_checkpointing = True
513
- # _no_split_modules = ["QwenImageTransformerBlock"]
514
- # _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
515
- # _repeated_blocks = ["QwenImageTransformerBlock"]
516
- _supports_gradient_checkpointing = True
517
-
518
- @register_to_config
519
- def __init__(
520
- self,
521
- patch_size: int = 2,
522
- in_channels: int = 64,
523
- out_channels: Optional[int] = 16,
524
- num_layers: int = 60,
525
- attention_head_dim: int = 128,
526
- num_attention_heads: int = 24,
527
- joint_attention_dim: int = 3584,
528
- guidance_embeds: bool = False, # TODO: this should probably be removed
529
- axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
530
- ):
531
- super().__init__()
532
- self.out_channels = out_channels or in_channels
533
- self.inner_dim = num_attention_heads * attention_head_dim
534
-
535
- self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
536
-
537
- self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim)
538
-
539
- self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
540
-
541
- self.img_in = nn.Linear(in_channels, self.inner_dim)
542
- self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim)
543
-
544
- self.transformer_blocks = nn.ModuleList(
545
- [
546
- QwenImageTransformerBlock(
547
- dim=self.inner_dim,
548
- num_attention_heads=num_attention_heads,
549
- attention_head_dim=attention_head_dim,
550
- )
551
- for _ in range(num_layers)
552
- ]
553
- )
554
-
555
- self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
556
- self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
557
-
558
- self.teacache = None
559
- self.cfg_skip_ratio = None
560
- self.current_steps = 0
561
- self.num_inference_steps = None
562
- self.gradient_checkpointing = False
563
- self.sp_world_size = 1
564
- self.sp_world_rank = 0
565
-
566
- def _set_gradient_checkpointing(self, *args, **kwargs):
567
- if "value" in kwargs:
568
- self.gradient_checkpointing = kwargs["value"]
569
- elif "enable" in kwargs:
570
- self.gradient_checkpointing = kwargs["enable"]
571
- else:
572
- raise ValueError("Invalid set gradient checkpointing")
573
-
574
- def enable_multi_gpus_inference(self,):
575
- self.sp_world_size = get_sequence_parallel_world_size()
576
- self.sp_world_rank = get_sequence_parallel_rank()
577
- self.all_gather = get_sp_group().all_gather
578
- self.set_attn_processor(QwenImageMultiGPUsAttnProcessor2_0())
579
-
580
- @property
581
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
582
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
583
- r"""
584
- Returns:
585
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
586
- indexed by its weight name.
587
- """
588
- # set recursively
589
- processors = {}
590
-
591
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
592
- if hasattr(module, "get_processor"):
593
- processors[f"{name}.processor"] = module.get_processor()
594
-
595
- for sub_name, child in module.named_children():
596
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
597
-
598
- return processors
599
-
600
- for name, module in self.named_children():
601
- fn_recursive_add_processors(name, module, processors)
602
-
603
- return processors
604
-
605
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
606
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
607
- r"""
608
- Sets the attention processor to use to compute attention.
609
-
610
- Parameters:
611
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
612
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
613
- for **all** `Attention` layers.
614
-
615
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
616
- processor. This is strongly recommended when setting trainable attention processors.
617
-
618
- """
619
- count = len(self.attn_processors.keys())
620
-
621
- if isinstance(processor, dict) and len(processor) != count:
622
- raise ValueError(
623
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
624
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
625
- )
626
-
627
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
628
- if hasattr(module, "set_processor"):
629
- if not isinstance(processor, dict):
630
- module.set_processor(processor)
631
- else:
632
- module.set_processor(processor.pop(f"{name}.processor"))
633
-
634
- for sub_name, child in module.named_children():
635
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
636
-
637
- for name, module in self.named_children():
638
- fn_recursive_attn_processor(name, module, processor)
639
-
640
- def enable_cfg_skip(self, cfg_skip_ratio, num_steps):
641
- if cfg_skip_ratio != 0:
642
- self.cfg_skip_ratio = cfg_skip_ratio
643
- self.current_steps = 0
644
- self.num_inference_steps = num_steps
645
- else:
646
- self.cfg_skip_ratio = None
647
- self.current_steps = 0
648
- self.num_inference_steps = None
649
-
650
- def share_cfg_skip(
651
- self,
652
- transformer = None,
653
- ):
654
- self.cfg_skip_ratio = transformer.cfg_skip_ratio
655
- self.current_steps = transformer.current_steps
656
- self.num_inference_steps = transformer.num_inference_steps
657
-
658
- def disable_cfg_skip(self):
659
- self.cfg_skip_ratio = None
660
- self.current_steps = 0
661
- self.num_inference_steps = None
662
-
663
- def enable_teacache(
664
- self,
665
- coefficients,
666
- num_steps: int,
667
- rel_l1_thresh: float,
668
- num_skip_start_steps: int = 0,
669
- offload: bool = True,
670
- ):
671
- self.teacache = TeaCache(
672
- coefficients, num_steps, rel_l1_thresh=rel_l1_thresh, num_skip_start_steps=num_skip_start_steps, offload=offload
673
- )
674
-
675
- def share_teacache(
676
- self,
677
- transformer = None,
678
- ):
679
- self.teacache = transformer.teacache
680
-
681
- def disable_teacache(self):
682
- self.teacache = None
683
-
684
- @cfg_skip()
685
- def forward_bs(self, x, *args, **kwargs):
686
- func = self.forward
687
- sig = inspect.signature(func)
688
-
689
- bs = len(x)
690
- bs_half = int(bs // 2)
691
-
692
- if bs >= 2:
693
- # cond
694
- x_i = x[bs_half:]
695
- args_i = [
696
- arg[bs_half:] if
697
- isinstance(arg,
698
- (torch.Tensor, list, tuple, np.ndarray)) and
699
- len(arg) == bs else arg for arg in args
700
- ]
701
- kwargs_i = {
702
- k: (v[bs_half:] if
703
- isinstance(v,
704
- (torch.Tensor, list, tuple,
705
- np.ndarray)) and len(v) == bs else v
706
- ) for k, v in kwargs.items()
707
- }
708
- if 'cond_flag' in sig.parameters:
709
- kwargs_i["cond_flag"] = True
710
-
711
- cond_out = func(x_i, *args_i, **kwargs_i)
712
-
713
- # uncond
714
- uncond_x_i = x[:bs_half]
715
- uncond_args_i = [
716
- arg[:bs_half] if
717
- isinstance(arg,
718
- (torch.Tensor, list, tuple, np.ndarray)) and
719
- len(arg) == bs else arg for arg in args
720
- ]
721
- uncond_kwargs_i = {
722
- k: (v[:bs_half] if
723
- isinstance(v,
724
- (torch.Tensor, list, tuple,
725
- np.ndarray)) and len(v) == bs else v
726
- ) for k, v in kwargs.items()
727
- }
728
- if 'cond_flag' in sig.parameters:
729
- uncond_kwargs_i["cond_flag"] = False
730
- uncond_out = func(uncond_x_i, *uncond_args_i,
731
- **uncond_kwargs_i)
732
-
733
- x = torch.cat([uncond_out, cond_out], dim=0)
734
- else:
735
- x = func(x, *args, **kwargs)
736
-
737
- return x
738
-
739
- def forward(
740
- self,
741
- hidden_states: torch.Tensor,
742
- encoder_hidden_states: torch.Tensor = None,
743
- encoder_hidden_states_mask: torch.Tensor = None,
744
- timestep: torch.LongTensor = None,
745
- img_shapes: Optional[List[Tuple[int, int, int]]] = None,
746
- txt_seq_lens: Optional[List[int]] = None,
747
- guidance: torch.Tensor = None, # TODO: this should probably be removed
748
- attention_kwargs: Optional[Dict[str, Any]] = None,
749
- cond_flag: bool = True,
750
- return_dict: bool = True,
751
- ) -> Union[torch.Tensor, Transformer2DModelOutput]:
752
- """
753
- The [`QwenImageTransformer2DModel`] forward method.
754
-
755
- Args:
756
- hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
757
- Input `hidden_states`.
758
- encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
759
- Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
760
- encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
761
- Mask of the input conditions.
762
- timestep ( `torch.LongTensor`):
763
- Used to indicate denoising step.
764
- attention_kwargs (`dict`, *optional*):
765
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
766
- `self.processor` in
767
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
768
- return_dict (`bool`, *optional*, defaults to `True`):
769
- Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
770
- tuple.
771
-
772
- Returns:
773
- If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
774
- `tuple` where the first element is the sample tensor.
775
- """
776
- if attention_kwargs is not None:
777
- attention_kwargs = attention_kwargs.copy()
778
- lora_scale = attention_kwargs.pop("scale", 1.0)
779
- else:
780
- lora_scale = 1.0
781
-
782
- if USE_PEFT_BACKEND:
783
- # weight the lora layers by setting `lora_scale` for each PEFT layer
784
- scale_lora_layers(self, lora_scale)
785
- else:
786
- if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
787
- logger.warning(
788
- "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
789
- )
790
-
791
- if isinstance(encoder_hidden_states, list):
792
- encoder_hidden_states = torch.stack(encoder_hidden_states)
793
- encoder_hidden_states_mask = torch.stack(encoder_hidden_states_mask)
794
-
795
- hidden_states = self.img_in(hidden_states)
796
-
797
- timestep = timestep.to(hidden_states.dtype)
798
- encoder_hidden_states = self.txt_norm(encoder_hidden_states)
799
- encoder_hidden_states = self.txt_in(encoder_hidden_states)
800
-
801
- if guidance is not None:
802
- guidance = guidance.to(hidden_states.dtype) * 1000
803
-
804
- temb = (
805
- self.time_text_embed(timestep, hidden_states)
806
- if guidance is None
807
- else self.time_text_embed(timestep, guidance, hidden_states)
808
- )
809
-
810
- image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=hidden_states.device)
811
-
812
- # Context Parallel
813
- if self.sp_world_size > 1:
814
- hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
815
- if image_rotary_emb is not None:
816
- image_rotary_emb = (
817
- torch.chunk(image_rotary_emb[0], self.sp_world_size, dim=0)[self.sp_world_rank],
818
- image_rotary_emb[1]
819
- )
820
-
821
- # TeaCache
822
- if self.teacache is not None:
823
- if cond_flag:
824
- inp = hidden_states.clone()
825
- temb_ = temb.clone()
826
- encoder_hidden_states_ = encoder_hidden_states.clone()
827
-
828
- img_mod_params_ = self.transformer_blocks[0].img_mod(temb_)
829
- img_mod1_, img_mod2_ = img_mod_params_.chunk(2, dim=-1)
830
- img_normed_ = self.transformer_blocks[0].img_norm1(inp)
831
- modulated_inp, img_gate1_ = self.transformer_blocks[0]._modulate(img_normed_, img_mod1_)
832
-
833
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
834
- if skip_flag:
835
- self.should_calc = True
836
- self.teacache.accumulated_rel_l1_distance = 0
837
- else:
838
- if cond_flag:
839
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
840
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
841
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
842
- self.should_calc = False
843
- else:
844
- self.should_calc = True
845
- self.teacache.accumulated_rel_l1_distance = 0
846
- self.teacache.previous_modulated_input = modulated_inp
847
- self.teacache.should_calc = self.should_calc
848
- else:
849
- self.should_calc = self.teacache.should_calc
850
-
851
- # TeaCache
852
- if self.teacache is not None:
853
- if not self.should_calc:
854
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
855
- hidden_states = hidden_states + previous_residual.to(hidden_states.device)[-hidden_states.size()[0]:,]
856
- else:
857
- ori_hidden_states = hidden_states.clone().cpu() if self.teacache.offload else hidden_states.clone()
858
-
859
- # 4. Transformer blocks
860
- for i, block in enumerate(self.transformer_blocks):
861
- if torch.is_grad_enabled() and self.gradient_checkpointing:
862
- def create_custom_forward(module):
863
- def custom_forward(*inputs):
864
- return module(*inputs)
865
-
866
- return custom_forward
867
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
868
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
869
- create_custom_forward(block),
870
- hidden_states,
871
- encoder_hidden_states,
872
- encoder_hidden_states_mask,
873
- temb,
874
- image_rotary_emb,
875
- **ckpt_kwargs,
876
- )
877
-
878
- else:
879
- encoder_hidden_states, hidden_states = block(
880
- hidden_states=hidden_states,
881
- encoder_hidden_states=encoder_hidden_states,
882
- encoder_hidden_states_mask=encoder_hidden_states_mask,
883
- temb=temb,
884
- image_rotary_emb=image_rotary_emb,
885
- joint_attention_kwargs=attention_kwargs,
886
- )
887
-
888
- if cond_flag:
889
- self.teacache.previous_residual_cond = hidden_states.cpu() - ori_hidden_states if self.teacache.offload else hidden_states - ori_hidden_states
890
- else:
891
- self.teacache.previous_residual_uncond = hidden_states.cpu() - ori_hidden_states if self.teacache.offload else hidden_states - ori_hidden_states
892
- del ori_hidden_states
893
- else:
894
- for index_block, block in enumerate(self.transformer_blocks):
895
- if torch.is_grad_enabled() and self.gradient_checkpointing:
896
- def create_custom_forward(module):
897
- def custom_forward(*inputs):
898
- return module(*inputs)
899
-
900
- return custom_forward
901
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
902
- encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
903
- create_custom_forward(block),
904
- hidden_states,
905
- encoder_hidden_states,
906
- encoder_hidden_states_mask,
907
- temb,
908
- image_rotary_emb,
909
- **ckpt_kwargs,
910
- )
911
-
912
- else:
913
- encoder_hidden_states, hidden_states = block(
914
- hidden_states=hidden_states,
915
- encoder_hidden_states=encoder_hidden_states,
916
- encoder_hidden_states_mask=encoder_hidden_states_mask,
917
- temb=temb,
918
- image_rotary_emb=image_rotary_emb,
919
- joint_attention_kwargs=attention_kwargs,
920
- )
921
-
922
- # Use only the image part (hidden_states) from the dual-stream blocks
923
- hidden_states = self.norm_out(hidden_states, temb)
924
- output = self.proj_out(hidden_states)
925
-
926
- if self.sp_world_size > 1:
927
- output = self.all_gather(output, dim=1)
928
-
929
- if USE_PEFT_BACKEND:
930
- # remove `lora_scale` from each PEFT layer
931
- unscale_lora_layers(self, lora_scale)
932
-
933
- if self.teacache is not None and cond_flag:
934
- self.teacache.cnt += 1
935
- if self.teacache.cnt == self.teacache.num_steps:
936
- self.teacache.reset()
937
- return output
938
-
939
- @classmethod
940
- def from_pretrained(
941
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
942
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
943
- ):
944
- if subfolder is not None:
945
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
946
- print(f"loaded transformer's pretrained weights from {pretrained_model_path} ...")
947
-
948
- config_file = os.path.join(pretrained_model_path, 'config.json')
949
- if not os.path.isfile(config_file):
950
- raise RuntimeError(f"{config_file} does not exist")
951
- with open(config_file, "r") as f:
952
- config = json.load(f)
953
-
954
- from diffusers.utils import WEIGHTS_NAME
955
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
956
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
957
-
958
- if "dict_mapping" in transformer_additional_kwargs.keys():
959
- for key in transformer_additional_kwargs["dict_mapping"]:
960
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
961
-
962
- if low_cpu_mem_usage:
963
- try:
964
- import re
965
-
966
- from diffusers import __version__ as diffusers_version
967
- if diffusers_version >= "0.33.0":
968
- from diffusers.models.model_loading_utils import \
969
- load_model_dict_into_meta
970
- else:
971
- from diffusers.models.modeling_utils import \
972
- load_model_dict_into_meta
973
- from diffusers.utils import is_accelerate_available
974
- if is_accelerate_available():
975
- import accelerate
976
-
977
- # Instantiate model with empty weights
978
- with accelerate.init_empty_weights():
979
- model = cls.from_config(config, **transformer_additional_kwargs)
980
-
981
- param_device = "cpu"
982
- if os.path.exists(model_file):
983
- state_dict = torch.load(model_file, map_location="cpu")
984
- elif os.path.exists(model_file_safetensors):
985
- from safetensors.torch import load_file, safe_open
986
- state_dict = load_file(model_file_safetensors)
987
- else:
988
- from safetensors.torch import load_file, safe_open
989
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
990
- state_dict = {}
991
- print(model_files_safetensors)
992
- for _model_file_safetensors in model_files_safetensors:
993
- _state_dict = load_file(_model_file_safetensors)
994
- for key in _state_dict:
995
- state_dict[key] = _state_dict[key]
996
-
997
- filtered_state_dict = {}
998
- for key in state_dict:
999
- if key in model.state_dict() and model.state_dict()[key].size() == state_dict[key].size():
1000
- filtered_state_dict[key] = state_dict[key]
1001
- else:
1002
- print(f"Skipping key '{key}' due to size mismatch or absence in model.")
1003
-
1004
- model_keys = set(model.state_dict().keys())
1005
- loaded_keys = set(filtered_state_dict.keys())
1006
- missing_keys = model_keys - loaded_keys
1007
-
1008
- def initialize_missing_parameters(missing_keys, model_state_dict, torch_dtype=None):
1009
- initialized_dict = {}
1010
-
1011
- with torch.no_grad():
1012
- for key in missing_keys:
1013
- param_shape = model_state_dict[key].shape
1014
- param_dtype = torch_dtype if torch_dtype is not None else model_state_dict[key].dtype
1015
- if 'weight' in key:
1016
- if any(norm_type in key for norm_type in ['norm', 'ln_', 'layer_norm', 'group_norm', 'batch_norm']):
1017
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1018
- elif 'embedding' in key or 'embed' in key:
1019
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1020
- elif 'head' in key or 'output' in key or 'proj_out' in key:
1021
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1022
- elif len(param_shape) >= 2:
1023
- initialized_dict[key] = torch.empty(param_shape, dtype=param_dtype)
1024
- nn.init.xavier_uniform_(initialized_dict[key])
1025
- else:
1026
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1027
- elif 'bias' in key:
1028
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1029
- elif 'running_mean' in key:
1030
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1031
- elif 'running_var' in key:
1032
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1033
- elif 'num_batches_tracked' in key:
1034
- initialized_dict[key] = torch.zeros(param_shape, dtype=torch.long)
1035
- else:
1036
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1037
-
1038
- return initialized_dict
1039
-
1040
- if missing_keys:
1041
- print(f"Missing keys will be initialized: {sorted(missing_keys)}")
1042
- initialized_params = initialize_missing_parameters(
1043
- missing_keys,
1044
- model.state_dict(),
1045
- torch_dtype
1046
- )
1047
- filtered_state_dict.update(initialized_params)
1048
-
1049
- if diffusers_version >= "0.33.0":
1050
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
1051
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
1052
- load_model_dict_into_meta(
1053
- model,
1054
- filtered_state_dict,
1055
- dtype=torch_dtype,
1056
- model_name_or_path=pretrained_model_path,
1057
- )
1058
- else:
1059
- model._convert_deprecated_attention_blocks(filtered_state_dict)
1060
- unexpected_keys = load_model_dict_into_meta(
1061
- model,
1062
- filtered_state_dict,
1063
- device=param_device,
1064
- dtype=torch_dtype,
1065
- model_name_or_path=pretrained_model_path,
1066
- )
1067
-
1068
- if cls._keys_to_ignore_on_load_unexpected is not None:
1069
- for pat in cls._keys_to_ignore_on_load_unexpected:
1070
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
1071
-
1072
- if len(unexpected_keys) > 0:
1073
- print(
1074
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
1075
- )
1076
-
1077
- return model
1078
- except Exception as e:
1079
- print(
1080
- f"The low_cpu_mem_usage mode did not work because {e}. Falling back to low_cpu_mem_usage=False."
1081
- )
1082
-
1083
- model = cls.from_config(config, **transformer_additional_kwargs)
1084
- if os.path.exists(model_file):
1085
- state_dict = torch.load(model_file, map_location="cpu")
1086
- elif os.path.exists(model_file_safetensors):
1087
- from safetensors.torch import load_file, safe_open
1088
- state_dict = load_file(model_file_safetensors)
1089
- else:
1090
- from safetensors.torch import load_file, safe_open
1091
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1092
- state_dict = {}
1093
- for _model_file_safetensors in model_files_safetensors:
1094
- _state_dict = load_file(_model_file_safetensors)
1095
- for key in _state_dict:
1096
- state_dict[key] = _state_dict[key]
1097
-
1098
- tmp_state_dict = {}
1099
- for key in state_dict:
1100
- if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
1101
- tmp_state_dict[key] = state_dict[key]
1102
- else:
1103
- print(key, "Size doesn't match, skip")
1104
-
1105
- state_dict = tmp_state_dict
1106
-
1107
- m, u = model.load_state_dict(state_dict, strict=False)
1108
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1109
- print(m)
1110
-
1111
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
1112
- print(f"### All Parameters: {sum(params) / 1e6} M")
1113
-
1114
- params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
1115
- print(f"### attn1 Parameters: {sum(params) / 1e6} M")
1116
-
1117
- model = model.to(torch_dtype)
1118
- return model
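
For reference, below is a minimal, hypothetical sketch of how the QwenImageTransformer2DModel deleted above could be exercised, assuming a repository revision in which videox_fun/models/qwenimage_transformer2d.py (and the videox_fun.dist / videox_fun.models helpers it imports) still exists. The tiny configuration and tensor shapes are illustrative only; the shipped defaults are the ones documented in the class (60 layers, 24 heads of dimension 128, joint_attention_dim 3584).

import torch
from videox_fun.models.qwenimage_transformer2d import QwenImageTransformer2DModel

# Tiny illustrative config; axes_dims_rope must sum to attention_head_dim.
model = QwenImageTransformer2DModel(
    patch_size=2,
    in_channels=16,
    out_channels=16,
    num_layers=2,
    attention_head_dim=32,
    num_attention_heads=2,
    joint_attention_dim=32,
    axes_dims_rope=(8, 12, 12),
).eval()

img_shapes = [(1, 4, 4)]                       # (frame, height, width) of the packed image latents
hidden_states = torch.randn(1, 1 * 4 * 4, 16)  # (batch, image_seq_len, in_channels)
encoder_hidden_states = torch.randn(1, 7, 32)  # (batch, text_seq_len, joint_attention_dim)
encoder_hidden_states_mask = torch.ones(1, 7)

with torch.no_grad():
    out = model(
        hidden_states=hidden_states,
        encoder_hidden_states=encoder_hidden_states,
        encoder_hidden_states_mask=encoder_hidden_states_mask,
        timestep=torch.tensor([500]),
        img_shapes=img_shapes,
        txt_seq_lens=[7],
    )
print(out.shape)  # (1, 16, patch_size**2 * out_channels) = (1, 16, 64)

Note that this forward() returns the projected latent tokens directly rather than a Transformer2DModelOutput, so the result above is a plain tensor.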
 
 
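The next deleted file, videox_fun/models/qwenimage_vae.py, builds its encoder and decoder on causal 3D convolutions (QwenImageCausalConv3d) that pad only the "past" side of the time axis and cache trailing frames for chunked inference. The snippet below is a self-contained sketch of just the causal-padding idea, not the removed implementation; the class name and shapes are made up for illustration.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CausalConv3dSketch(nn.Conv3d):
    """Pad height/width symmetrically, but pad time only on the 'past' side."""
    def __init__(self, in_ch, out_ch, kernel_size=3):
        super().__init__(in_ch, out_ch, kernel_size, padding=0)
        kt, kh, kw = self.kernel_size
        # F.pad order: (w_left, w_right, h_top, h_bottom, t_front, t_back)
        self._causal_pad = (kw // 2, kw // 2, kh // 2, kh // 2, kt - 1, 0)

    def forward(self, x):
        # Frame t only ever sees frames <= t, which is what makes chunked
        # decoding with a small temporal feature cache possible.
        return super().forward(F.pad(x, self._causal_pad))

x = torch.randn(1, 3, 5, 8, 8)  # (batch, channels, time, height, width)
y = CausalConv3dSketch(3, 4)(x)
print(y.shape)                  # torch.Size([1, 4, 5, 8, 8])
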
videox_fun/models/qwenimage_vae.py DELETED
@@ -1,1087 +0,0 @@
1
- # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
2
- # Copyright 2025 The Qwen-Image Team, Wan Team and The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- #
16
- # We gratefully acknowledge the Wan Team for their outstanding contributions.
17
- # QwenImageVAE is further fine-tuned from the Wan Video VAE to achieve improved performance.
18
- # For more information about the Wan VAE, please refer to:
19
- # - GitHub: https://github.com/Wan-Video/Wan2.1
20
- # - arXiv: https://arxiv.org/abs/2503.20314
21
-
22
- import functools
23
- import glob
24
- import json
25
- import math
26
- import os
27
- import types
28
- import warnings
29
- from typing import Any, Dict, List, Optional, Tuple, Union
30
-
31
- import numpy as np
32
- import torch
33
- import torch.cuda.amp as amp
34
- import torch.nn as nn
35
- import torch.nn.functional as F
36
- import torch.utils.checkpoint
37
- from diffusers.configuration_utils import ConfigMixin, register_to_config
38
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
39
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
40
- from diffusers.models.activations import get_activation
41
- from diffusers.models.attention import FeedForward
42
- from diffusers.models.attention_processor import Attention
43
- from diffusers.models.autoencoders.vae import (DecoderOutput,
44
- DiagonalGaussianDistribution)
45
- from diffusers.models.embeddings import TimestepEmbedding, Timesteps
46
- from diffusers.models.modeling_outputs import (AutoencoderKLOutput,
47
- Transformer2DModelOutput)
48
- from diffusers.models.modeling_utils import ModelMixin
49
- from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
50
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
51
- scale_lora_layers, unscale_lora_layers)
52
- from diffusers.utils.accelerate_utils import apply_forward_hook
53
- from diffusers.utils.torch_utils import maybe_allow_in_graph
54
- from torch import nn
55
-
56
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
-
58
- CACHE_T = 2
59
-
60
- class QwenImageCausalConv3d(nn.Conv3d):
61
- r"""
62
- A custom 3D causal convolution layer with feature caching support.
63
-
64
- This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
65
- caching for efficient inference.
66
-
67
- Args:
68
- in_channels (int): Number of channels in the input image
69
- out_channels (int): Number of channels produced by the convolution
70
- kernel_size (int or tuple): Size of the convolving kernel
71
- stride (int or tuple, optional): Stride of the convolution. Default: 1
72
- padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
73
- """
74
-
75
- def __init__(
76
- self,
77
- in_channels: int,
78
- out_channels: int,
79
- kernel_size: Union[int, Tuple[int, int, int]],
80
- stride: Union[int, Tuple[int, int, int]] = 1,
81
- padding: Union[int, Tuple[int, int, int]] = 0,
82
- ) -> None:
83
- super().__init__(
84
- in_channels=in_channels,
85
- out_channels=out_channels,
86
- kernel_size=kernel_size,
87
- stride=stride,
88
- padding=padding,
89
- )
90
-
91
- # Set up causal padding
92
- self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
93
- self.padding = (0, 0, 0)
94
-
95
- def forward(self, x, cache_x=None):
96
- padding = list(self._padding)
97
- if cache_x is not None and self._padding[4] > 0:
98
- cache_x = cache_x.to(x.device)
99
- x = torch.cat([cache_x, x], dim=2)
100
- padding[4] -= cache_x.shape[2]
101
- x = F.pad(x, padding)
102
- return super().forward(x)
103
-
104
-
105
- class QwenImageRMS_norm(nn.Module):
106
- r"""
107
- A custom RMS normalization layer.
108
-
109
- Args:
110
- dim (int): The number of dimensions to normalize over.
111
- channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
112
- Default is True.
113
- images (bool, optional): Whether the input represents image data. Default is True.
114
- bias (bool, optional): Whether to include a learnable bias term. Default is False.
115
- """
116
-
117
- def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
118
- super().__init__()
119
- broadcastable_dims = (1, 1, 1) if not images else (1, 1)
120
- shape = (dim, *broadcastable_dims) if channel_first else (dim,)
121
-
122
- self.channel_first = channel_first
123
- self.scale = dim**0.5
124
- self.gamma = nn.Parameter(torch.ones(shape))
125
- self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
126
-
127
- def forward(self, x):
128
- return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
129
-
130
-
131
- class QwenImageUpsample(nn.Upsample):
132
- r"""
133
- Perform upsampling while ensuring the output tensor has the same data type as the input.
134
-
135
- Args:
136
- x (torch.Tensor): Input tensor to be upsampled.
137
-
138
- Returns:
139
- torch.Tensor: Upsampled tensor with the same data type as the input.
140
- """
141
-
142
- def forward(self, x):
143
- return super().forward(x.float()).type_as(x)
144
-
145
-
146
- class QwenImageResample(nn.Module):
147
- r"""
148
- A custom resampling module for 2D and 3D data.
149
-
150
- Args:
151
- dim (int): The number of input/output channels.
152
- mode (str): The resampling mode. Must be one of:
153
- - 'none': No resampling (identity operation).
154
- - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
155
- - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
156
- - 'downsample2d': 2D downsampling with zero-padding and convolution.
157
- - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
158
- """
159
-
160
- def __init__(self, dim: int, mode: str) -> None:
161
- super().__init__()
162
- self.dim = dim
163
- self.mode = mode
164
-
165
- # layers
166
- if mode == "upsample2d":
167
- self.resample = nn.Sequential(
168
- QwenImageUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
169
- nn.Conv2d(dim, dim // 2, 3, padding=1),
170
- )
171
- elif mode == "upsample3d":
172
- self.resample = nn.Sequential(
173
- QwenImageUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
174
- nn.Conv2d(dim, dim // 2, 3, padding=1),
175
- )
176
- self.time_conv = QwenImageCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
177
-
178
- elif mode == "downsample2d":
179
- self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
180
- elif mode == "downsample3d":
181
- self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
182
- self.time_conv = QwenImageCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
183
-
184
- else:
185
- self.resample = nn.Identity()
186
-
187
- def forward(self, x, feat_cache=None, feat_idx=[0]):
188
- b, c, t, h, w = x.size()
189
- if self.mode == "upsample3d":
190
- if feat_cache is not None:
191
- idx = feat_idx[0]
192
- if feat_cache[idx] is None:
193
- feat_cache[idx] = "Rep"
194
- feat_idx[0] += 1
195
- else:
196
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
197
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
198
- # cache last frame of last two chunk
199
- cache_x = torch.cat(
200
- [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
201
- )
202
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
203
- cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
204
- if feat_cache[idx] == "Rep":
205
- x = self.time_conv(x)
206
- else:
207
- x = self.time_conv(x, feat_cache[idx])
208
- feat_cache[idx] = cache_x
209
- feat_idx[0] += 1
210
-
211
- x = x.reshape(b, 2, c, t, h, w)
212
- x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
213
- x = x.reshape(b, c, t * 2, h, w)
214
- t = x.shape[2]
215
- x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
216
- x = self.resample(x)
217
- x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
218
-
219
- if self.mode == "downsample3d":
220
- if feat_cache is not None:
221
- idx = feat_idx[0]
222
- if feat_cache[idx] is None:
223
- feat_cache[idx] = x.clone()
224
- feat_idx[0] += 1
225
- else:
226
- cache_x = x[:, :, -1:, :, :].clone()
227
- x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
228
- feat_cache[idx] = cache_x
229
- feat_idx[0] += 1
230
- return x
231
-
232
-
233
- class QwenImageResidualBlock(nn.Module):
234
- r"""
235
- A custom residual block module.
236
-
237
- Args:
238
- in_dim (int): Number of input channels.
239
- out_dim (int): Number of output channels.
240
- dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
241
- non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
242
- """
243
-
244
- def __init__(
245
- self,
246
- in_dim: int,
247
- out_dim: int,
248
- dropout: float = 0.0,
249
- non_linearity: str = "silu",
250
- ) -> None:
251
- super().__init__()
252
- self.in_dim = in_dim
253
- self.out_dim = out_dim
254
- self.nonlinearity = get_activation(non_linearity)
255
-
256
- # layers
257
- self.norm1 = QwenImageRMS_norm(in_dim, images=False)
258
- self.conv1 = QwenImageCausalConv3d(in_dim, out_dim, 3, padding=1)
259
- self.norm2 = QwenImageRMS_norm(out_dim, images=False)
260
- self.dropout = nn.Dropout(dropout)
261
- self.conv2 = QwenImageCausalConv3d(out_dim, out_dim, 3, padding=1)
262
- self.conv_shortcut = QwenImageCausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
263
-
264
- def forward(self, x, feat_cache=None, feat_idx=[0]):
265
- # Apply shortcut connection
266
- h = self.conv_shortcut(x)
267
-
268
- # First normalization and activation
269
- x = self.norm1(x)
270
- x = self.nonlinearity(x)
271
-
272
- if feat_cache is not None:
273
- idx = feat_idx[0]
274
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
275
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
276
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
277
-
278
- x = self.conv1(x, feat_cache[idx])
279
- feat_cache[idx] = cache_x
280
- feat_idx[0] += 1
281
- else:
282
- x = self.conv1(x)
283
-
284
- # Second normalization and activation
285
- x = self.norm2(x)
286
- x = self.nonlinearity(x)
287
-
288
- # Dropout
289
- x = self.dropout(x)
290
-
291
- if feat_cache is not None:
292
- idx = feat_idx[0]
293
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
294
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
295
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
296
-
297
- x = self.conv2(x, feat_cache[idx])
298
- feat_cache[idx] = cache_x
299
- feat_idx[0] += 1
300
- else:
301
- x = self.conv2(x)
302
-
303
- # Add residual connection
304
- return x + h
305
-
306
-
307
- class QwenImageAttentionBlock(nn.Module):
308
- r"""
309
- Causal self-attention with a single head.
310
-
311
- Args:
312
- dim (int): The number of channels in the input tensor.
313
- """
314
-
315
- def __init__(self, dim):
316
- super().__init__()
317
- self.dim = dim
318
-
319
- # layers
320
- self.norm = QwenImageRMS_norm(dim)
321
- self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
322
- self.proj = nn.Conv2d(dim, dim, 1)
323
-
324
- def forward(self, x):
325
- identity = x
326
- batch_size, channels, time, height, width = x.size()
327
-
328
- x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * time, channels, height, width)
329
- x = self.norm(x)
330
-
331
- # compute query, key, value
332
- qkv = self.to_qkv(x)
333
- qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
334
- qkv = qkv.permute(0, 1, 3, 2).contiguous()
335
- q, k, v = qkv.chunk(3, dim=-1)
336
-
337
- # apply attention
338
- x = F.scaled_dot_product_attention(q, k, v)
339
-
340
- x = x.squeeze(1).permute(0, 2, 1).reshape(batch_size * time, channels, height, width)
341
-
342
- # output projection
343
- x = self.proj(x)
344
-
345
- # Reshape back: [(b*t), c, h, w] -> [b, c, t, h, w]
346
- x = x.view(batch_size, time, channels, height, width)
347
- x = x.permute(0, 2, 1, 3, 4)
348
-
349
- return x + identity
350
-
351
-
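A small layout sketch (illustrative only, not the module itself) of the attention pattern used by `QwenImageAttentionBlock`: a single head attends over the `h*w` spatial positions of each frame independently, with the head dimension equal to the channel count:

```python
import torch
import torch.nn.functional as F

b, c, t, h, w = 1, 8, 2, 4, 4
x = torch.randn(b, c, t, h, w)

tokens = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h * w)   # [(b t), c, h*w]
q = k = v = tokens.permute(0, 2, 1).unsqueeze(1)             # [(b t), 1 head, h*w, c]
out = F.scaled_dot_product_attention(q, k, v)                # per-frame spatial attention
out = out.squeeze(1).permute(0, 2, 1).reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4)
assert out.shape == x.shape                                  # back to [b, c, t, h, w]
```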
352
- class QwenImageMidBlock(nn.Module):
353
- """
354
- Middle block for QwenImageVAE encoder and decoder.
355
-
356
- Args:
357
- dim (int): Number of input/output channels.
358
- dropout (float): Dropout rate.
359
- non_linearity (str): Type of non-linearity to use.
360
- """
361
-
362
- def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str = "silu", num_layers: int = 1):
363
- super().__init__()
364
- self.dim = dim
365
-
366
- # Create the components
367
- resnets = [QwenImageResidualBlock(dim, dim, dropout, non_linearity)]
368
- attentions = []
369
- for _ in range(num_layers):
370
- attentions.append(QwenImageAttentionBlock(dim))
371
- resnets.append(QwenImageResidualBlock(dim, dim, dropout, non_linearity))
372
- self.attentions = nn.ModuleList(attentions)
373
- self.resnets = nn.ModuleList(resnets)
374
-
375
- self.gradient_checkpointing = False
376
-
377
- def forward(self, x, feat_cache=None, feat_idx=[0]):
378
- # First residual block
379
- x = self.resnets[0](x, feat_cache, feat_idx)
380
-
381
- # Process through attention and residual blocks
382
- for attn, resnet in zip(self.attentions, self.resnets[1:]):
383
- if attn is not None:
384
- x = attn(x)
385
-
386
- x = resnet(x, feat_cache, feat_idx)
387
-
388
- return x
389
-
390
-
391
- class QwenImageEncoder3d(nn.Module):
392
- r"""
393
- A 3D encoder module.
394
-
395
- Args:
396
- dim (int): The base number of channels in the first layer.
397
- z_dim (int): The dimensionality of the latent space.
398
- dim_mult (list of int): Multipliers for the number of channels in each block.
399
- num_res_blocks (int): Number of residual blocks in each block.
400
- attn_scales (list of float): Scales at which to apply attention mechanisms.
401
- temperal_downsample (list of bool): Whether to downsample temporally in each block.
402
- dropout (float): Dropout rate for the dropout layers.
403
- non_linearity (str): Type of non-linearity to use.
404
- """
405
-
406
- def __init__(
407
- self,
408
- dim=128,
409
- z_dim=4,
410
- dim_mult=[1, 2, 4, 4],
411
- num_res_blocks=2,
412
- attn_scales=[],
413
- temperal_downsample=[True, True, False],
414
- dropout=0.0,
415
- non_linearity: str = "silu",
416
- ):
417
- super().__init__()
418
- self.dim = dim
419
- self.z_dim = z_dim
420
- self.dim_mult = dim_mult
421
- self.num_res_blocks = num_res_blocks
422
- self.attn_scales = attn_scales
423
- self.temperal_downsample = temperal_downsample
424
- self.nonlinearity = get_activation(non_linearity)
425
-
426
- # dimensions
427
- dims = [dim * u for u in [1] + dim_mult]
428
- scale = 1.0
429
-
430
- # init block
431
- self.conv_in = QwenImageCausalConv3d(3, dims[0], 3, padding=1)
432
-
433
- # downsample blocks
434
- self.down_blocks = nn.ModuleList([])
435
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
436
- # residual (+attention) blocks
437
- for _ in range(num_res_blocks):
438
- self.down_blocks.append(QwenImageResidualBlock(in_dim, out_dim, dropout))
439
- if scale in attn_scales:
440
- self.down_blocks.append(QwenImageAttentionBlock(out_dim))
441
- in_dim = out_dim
442
-
443
- # downsample block
444
- if i != len(dim_mult) - 1:
445
- mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
446
- self.down_blocks.append(QwenImageResample(out_dim, mode=mode))
447
- scale /= 2.0
448
-
449
- # middle blocks
450
- self.mid_block = QwenImageMidBlock(out_dim, dropout, non_linearity, num_layers=1)
451
-
452
- # output blocks
453
- self.norm_out = QwenImageRMS_norm(out_dim, images=False)
454
- self.conv_out = QwenImageCausalConv3d(out_dim, z_dim, 3, padding=1)
455
-
456
- self.gradient_checkpointing = False
457
-
458
- def forward(self, x, feat_cache=None, feat_idx=[0]):
459
- if feat_cache is not None:
460
- idx = feat_idx[0]
461
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
462
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
463
- # cache last frame of last two chunk
464
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
465
- x = self.conv_in(x, feat_cache[idx])
466
- feat_cache[idx] = cache_x
467
- feat_idx[0] += 1
468
- else:
469
- x = self.conv_in(x)
470
-
471
- ## downsamples
472
- for layer in self.down_blocks:
473
- if feat_cache is not None:
474
- x = layer(x, feat_cache, feat_idx)
475
- else:
476
- x = layer(x)
477
-
478
- ## middle
479
- x = self.mid_block(x, feat_cache, feat_idx)
480
-
481
- ## head
482
- x = self.norm_out(x)
483
- x = self.nonlinearity(x)
484
- if feat_cache is not None:
485
- idx = feat_idx[0]
486
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
487
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
488
- # cache last frame of last two chunk
489
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
490
- x = self.conv_out(x, feat_cache[idx])
491
- feat_cache[idx] = cache_x
492
- feat_idx[0] += 1
493
- else:
494
- x = self.conv_out(x)
495
- return x
496
-
497
-
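To make the constructor above easier to follow, this is the channel schedule it builds with the file's defaults `dim=128`, `dim_mult=[1, 2, 4, 4]` (shown only as an illustration of the arithmetic):

```python
dim, dim_mult = 128, [1, 2, 4, 4]
dims = [dim * u for u in [1] + dim_mult]   # [128, 128, 256, 512, 512]
stages = list(zip(dims[:-1], dims[1:]))    # (in_dim, out_dim) per downsampling stage
print(stages)                              # [(128, 128), (128, 256), (256, 512), (512, 512)]
```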
498
- class QwenImageUpBlock(nn.Module):
499
- """
500
- A block that handles upsampling for the QwenImageVAE decoder.
501
-
502
- Args:
503
- in_dim (int): Input dimension
504
- out_dim (int): Output dimension
505
- num_res_blocks (int): Number of residual blocks
506
- dropout (float): Dropout rate
507
- upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
508
- non_linearity (str): Type of non-linearity to use
509
- """
510
-
511
- def __init__(
512
- self,
513
- in_dim: int,
514
- out_dim: int,
515
- num_res_blocks: int,
516
- dropout: float = 0.0,
517
- upsample_mode: Optional[str] = None,
518
- non_linearity: str = "silu",
519
- ):
520
- super().__init__()
521
- self.in_dim = in_dim
522
- self.out_dim = out_dim
523
-
524
- # Create layers list
525
- resnets = []
526
- # Add residual blocks and attention if needed
527
- current_dim = in_dim
528
- for _ in range(num_res_blocks + 1):
529
- resnets.append(QwenImageResidualBlock(current_dim, out_dim, dropout, non_linearity))
530
- current_dim = out_dim
531
-
532
- self.resnets = nn.ModuleList(resnets)
533
-
534
- # Add upsampling layer if needed
535
- self.upsamplers = None
536
- if upsample_mode is not None:
537
- self.upsamplers = nn.ModuleList([QwenImageResample(out_dim, mode=upsample_mode)])
538
-
539
- self.gradient_checkpointing = False
540
-
541
- def forward(self, x, feat_cache=None, feat_idx=[0]):
542
- """
543
- Forward pass through the upsampling block.
544
-
545
- Args:
546
- x (torch.Tensor): Input tensor
547
- feat_cache (list, optional): Feature cache for causal convolutions
548
- feat_idx (list, optional): Feature index for cache management
549
-
550
- Returns:
551
- torch.Tensor: Output tensor
552
- """
553
- for resnet in self.resnets:
554
- if feat_cache is not None:
555
- x = resnet(x, feat_cache, feat_idx)
556
- else:
557
- x = resnet(x)
558
-
559
- if self.upsamplers is not None:
560
- if feat_cache is not None:
561
- x = self.upsamplers[0](x, feat_cache, feat_idx)
562
- else:
563
- x = self.upsamplers[0](x)
564
- return x
565
-
566
-
567
- class QwenImageDecoder3d(nn.Module):
568
- r"""
569
- A 3D decoder module.
570
-
571
- Args:
572
- dim (int): The base number of channels in the first layer.
573
- z_dim (int): The dimensionality of the latent space.
574
- dim_mult (list of int): Multipliers for the number of channels in each block.
575
- num_res_blocks (int): Number of residual blocks in each block.
576
- attn_scales (list of float): Scales at which to apply attention mechanisms.
577
- temperal_upsample (list of bool): Whether to upsample temporally in each block.
578
- dropout (float): Dropout rate for the dropout layers.
579
- non_linearity (str): Type of non-linearity to use.
580
- """
581
-
582
- def __init__(
583
- self,
584
- dim=128,
585
- z_dim=4,
586
- dim_mult=[1, 2, 4, 4],
587
- num_res_blocks=2,
588
- attn_scales=[],
589
- temperal_upsample=[False, True, True],
590
- dropout=0.0,
591
- non_linearity: str = "silu",
592
- ):
593
- super().__init__()
594
- self.dim = dim
595
- self.z_dim = z_dim
596
- self.dim_mult = dim_mult
597
- self.num_res_blocks = num_res_blocks
598
- self.attn_scales = attn_scales
599
- self.temperal_upsample = temperal_upsample
600
-
601
- self.nonlinearity = get_activation(non_linearity)
602
-
603
- # dimensions
604
- dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
605
- scale = 1.0 / 2 ** (len(dim_mult) - 2)
606
-
607
- # init block
608
- self.conv_in = QwenImageCausalConv3d(z_dim, dims[0], 3, padding=1)
609
-
610
- # middle blocks
611
- self.mid_block = QwenImageMidBlock(dims[0], dropout, non_linearity, num_layers=1)
612
-
613
- # upsample blocks
614
- self.up_blocks = nn.ModuleList([])
615
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
616
- # residual (+attention) blocks
617
- if i > 0:
618
- in_dim = in_dim // 2
619
-
620
- # Determine if we need upsampling
621
- upsample_mode = None
622
- if i != len(dim_mult) - 1:
623
- upsample_mode = "upsample3d" if temperal_upsample[i] else "upsample2d"
624
-
625
- # Create and add the upsampling block
626
- up_block = QwenImageUpBlock(
627
- in_dim=in_dim,
628
- out_dim=out_dim,
629
- num_res_blocks=num_res_blocks,
630
- dropout=dropout,
631
- upsample_mode=upsample_mode,
632
- non_linearity=non_linearity,
633
- )
634
- self.up_blocks.append(up_block)
635
-
636
- # Update scale for next iteration
637
- if upsample_mode is not None:
638
- scale *= 2.0
639
-
640
- # output blocks
641
- self.norm_out = QwenImageRMS_norm(out_dim, images=False)
642
- self.conv_out = QwenImageCausalConv3d(out_dim, 3, 3, padding=1)
643
-
644
- self.gradient_checkpointing = False
645
-
646
- def forward(self, x, feat_cache=None, feat_idx=[0]):
647
- ## conv1
648
- if feat_cache is not None:
649
- idx = feat_idx[0]
650
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
651
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
652
- # cache last frame of last two chunk
653
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
654
- x = self.conv_in(x, feat_cache[idx])
655
- feat_cache[idx] = cache_x
656
- feat_idx[0] += 1
657
- else:
658
- x = self.conv_in(x)
659
-
660
- ## middle
661
- x = self.mid_block(x, feat_cache, feat_idx)
662
-
663
- ## upsamples
664
- for up_block in self.up_blocks:
665
- x = up_block(x, feat_cache, feat_idx)
666
-
667
- ## head
668
- x = self.norm_out(x)
669
- x = self.nonlinearity(x)
670
- if feat_cache is not None:
671
- idx = feat_idx[0]
672
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
673
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
674
- # cache last frame of last two chunk
675
- cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
676
- x = self.conv_out(x, feat_cache[idx])
677
- feat_cache[idx] = cache_x
678
- feat_idx[0] += 1
679
- else:
680
- x = self.conv_out(x)
681
- return x
682
-
683
-
684
- class AutoencoderKLQwenImage(ModelMixin, ConfigMixin, FromOriginalModelMixin):
685
- r"""
686
- A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
687
-
688
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
689
- for all models (such as downloading or saving).
690
- """
691
-
692
- _supports_gradient_checkpointing = False
693
-
694
- # fmt: off
695
- @register_to_config
696
- def __init__(
697
- self,
698
- base_dim: int = 96,
699
- z_dim: int = 16,
700
- dim_mult: Tuple[int] = [1, 2, 4, 4],
701
- num_res_blocks: int = 2,
702
- attn_scales: List[float] = [],
703
- temperal_downsample: List[bool] = [False, True, True],
704
- dropout: float = 0.0,
705
- latents_mean: List[float] = [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921],
706
- latents_std: List[float] = [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160],
707
- ) -> None:
708
- # fmt: on
709
- super().__init__()
710
-
711
- self.z_dim = z_dim
712
- self.temperal_downsample = temperal_downsample
713
- self.temperal_upsample = temperal_downsample[::-1]
714
-
715
- self.encoder = QwenImageEncoder3d(
716
- base_dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout
717
- )
718
- self.quant_conv = QwenImageCausalConv3d(z_dim * 2, z_dim * 2, 1)
719
- self.post_quant_conv = QwenImageCausalConv3d(z_dim, z_dim, 1)
720
-
721
- self.decoder = QwenImageDecoder3d(
722
- base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout
723
- )
724
-
725
- self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
726
-
727
- # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
728
- # to perform decoding of a single video latent at a time.
729
- self.use_slicing = False
730
-
731
- # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
732
- # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
733
- # intermediate tiles together, the memory requirement can be lowered.
734
- self.use_tiling = False
735
-
736
- # The minimal tile height and width for spatial tiling to be used
737
- self.tile_sample_min_height = 256
738
- self.tile_sample_min_width = 256
739
-
740
- # The minimal distance between two spatial tiles
741
- self.tile_sample_stride_height = 192
742
- self.tile_sample_stride_width = 192
743
-
744
- # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
745
- self._cached_conv_counts = {
746
- "decoder": sum(isinstance(m, QwenImageCausalConv3d) for m in self.decoder.modules())
747
- if self.decoder is not None
748
- else 0,
749
- "encoder": sum(isinstance(m, QwenImageCausalConv3d) for m in self.encoder.modules())
750
- if self.encoder is not None
751
- else 0,
752
- }
753
-
754
- def enable_tiling(
755
- self,
756
- tile_sample_min_height: Optional[int] = None,
757
- tile_sample_min_width: Optional[int] = None,
758
- tile_sample_stride_height: Optional[float] = None,
759
- tile_sample_stride_width: Optional[float] = None,
760
- ) -> None:
761
- r"""
762
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
763
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
764
- processing larger images.
765
-
766
- Args:
767
- tile_sample_min_height (`int`, *optional*):
768
- The minimum height required for a sample to be separated into tiles across the height dimension.
769
- tile_sample_min_width (`int`, *optional*):
770
- The minimum width required for a sample to be separated into tiles across the width dimension.
771
- tile_sample_stride_height (`int`, *optional*):
772
- The stride between two consecutive vertical tiles. This is to ensure that there are
773
- no tiling artifacts produced across the height dimension.
774
- tile_sample_stride_width (`int`, *optional*):
775
- The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
776
- artifacts produced across the width dimension.
777
- """
778
- self.use_tiling = True
779
- self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
780
- self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
781
- self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
782
- self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
783
-
784
- def disable_tiling(self) -> None:
785
- r"""
786
- Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
787
- decoding in one step.
788
- """
789
- self.use_tiling = False
790
-
791
- def enable_slicing(self) -> None:
792
- r"""
793
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
794
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
795
- """
796
- self.use_slicing = True
797
-
798
- def disable_slicing(self) -> None:
799
- r"""
800
- Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
801
- decoding in one step.
802
- """
803
- self.use_slicing = False
804
-
805
- def clear_cache(self):
806
- def _count_conv3d(model):
807
- count = 0
808
- for m in model.modules():
809
- if isinstance(m, QwenImageCausalConv3d):
810
- count += 1
811
- return count
812
-
813
- self._conv_num = _count_conv3d(self.decoder)
814
- self._conv_idx = [0]
815
- self._feat_map = [None] * self._conv_num
816
- # cache encode
817
- self._enc_conv_num = _count_conv3d(self.encoder)
818
- self._enc_conv_idx = [0]
819
- self._enc_feat_map = [None] * self._enc_conv_num
820
-
821
- def _encode(self, x: torch.Tensor):
822
- _, _, num_frame, height, width = x.shape
823
-
824
- if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
825
- return self.tiled_encode(x)
826
-
827
- self.clear_cache()
828
- iter_ = 1 + (num_frame - 1) // 4
829
- for i in range(iter_):
830
- self._enc_conv_idx = [0]
831
- if i == 0:
832
- out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
833
- else:
834
- out_ = self.encoder(
835
- x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
836
- feat_cache=self._enc_feat_map,
837
- feat_idx=self._enc_conv_idx,
838
- )
839
- out = torch.cat([out, out_], 2)
840
-
841
- enc = self.quant_conv(out)
842
- self.clear_cache()
843
- return enc
844
-
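The temporal chunking performed by `_encode` above can be summarised with a short sketch: the first pixel frame is encoded on its own, then groups of four frames follow, so `num_frame` pixel frames (assumed to be of the form 1 + 4n) yield `1 + (num_frame - 1) // 4` latent frames:

```python
def encode_chunks(num_frame):
    chunks = [(0, 1)]                                 # the leading frame, encoded alone
    for i in range(1, 1 + (num_frame - 1) // 4):
        chunks.append((1 + 4 * (i - 1), 1 + 4 * i))   # subsequent slices of 4 frames
    return chunks

print(encode_chunks(13))  # [(0, 1), (1, 5), (5, 9), (9, 13)] -> 4 latent frames
```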
845
- @apply_forward_hook
846
- def encode(
847
- self, x: torch.Tensor, return_dict: bool = True
848
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
849
- r"""
850
- Encode a batch of images into latents.
851
-
852
- Args:
853
- x (`torch.Tensor`): Input batch of images.
854
- return_dict (`bool`, *optional*, defaults to `True`):
855
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
856
-
857
- Returns:
858
- The latent representations of the encoded videos. If `return_dict` is True, a
859
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
860
- """
861
- if self.use_slicing and x.shape[0] > 1:
862
- encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
863
- h = torch.cat(encoded_slices)
864
- else:
865
- h = self._encode(x)
866
- posterior = DiagonalGaussianDistribution(h)
867
-
868
- if not return_dict:
869
- return (posterior,)
870
- return AutoencoderKLOutput(latent_dist=posterior)
871
-
872
- def _decode(self, z: torch.Tensor, return_dict: bool = True):
873
- _, _, num_frame, height, width = z.shape
874
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
875
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
876
-
877
- if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
878
- return self.tiled_decode(z, return_dict=return_dict)
879
-
880
- self.clear_cache()
881
- x = self.post_quant_conv(z)
882
- for i in range(num_frame):
883
- self._conv_idx = [0]
884
- if i == 0:
885
- out = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
886
- else:
887
- out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
888
- out = torch.cat([out, out_], 2)
889
-
890
- out = torch.clamp(out, min=-1.0, max=1.0)
891
- self.clear_cache()
892
- if not return_dict:
893
- return (out,)
894
-
895
- return DecoderOutput(sample=out)
896
-
897
- @apply_forward_hook
898
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
899
- r"""
900
- Decode a batch of images.
901
-
902
- Args:
903
- z (`torch.Tensor`): Input batch of latent vectors.
904
- return_dict (`bool`, *optional*, defaults to `True`):
905
- Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
906
-
907
- Returns:
908
- [`~models.vae.DecoderOutput`] or `tuple`:
909
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
910
- returned.
911
- """
912
- if self.use_slicing and z.shape[0] > 1:
913
- decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
914
- decoded = torch.cat(decoded_slices)
915
- else:
916
- decoded = self._decode(z).sample
917
-
918
- if not return_dict:
919
- return (decoded,)
920
- return DecoderOutput(sample=decoded)
921
-
922
- def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
923
- blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
924
- for y in range(blend_extent):
925
- b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
926
- y / blend_extent
927
- )
928
- return b
929
-
930
- def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
931
- blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
932
- for x in range(blend_extent):
933
- b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
934
- x / blend_extent
935
- )
936
- return b
937
-
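`blend_v`/`blend_h` above perform a linear crossfade over the overlapping region of two tiles; a one-dimensional sketch (illustrative only) makes the weighting explicit:

```python
import torch

def blend_1d(a, b, blend_extent):
    blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
    for x in range(blend_extent):
        w = x / blend_extent
        b[..., x] = a[..., -blend_extent + x] * (1 - w) + b[..., x] * w
    return b

a = torch.ones(8)          # trailing edge of the previous tile
b = torch.zeros(8)         # leading edge of the current tile
print(blend_1d(a, b, 4))   # tensor([1.0000, 0.7500, 0.5000, 0.2500, 0., 0., 0., 0.])
```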
938
- def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
939
- r"""Encode a batch of images using a tiled encoder.
940
-
941
- Args:
942
- x (`torch.Tensor`): Input batch of videos.
943
-
944
- Returns:
945
- `torch.Tensor`:
946
- The latent representation of the encoded videos.
947
- """
948
- _, _, num_frames, height, width = x.shape
949
- latent_height = height // self.spatial_compression_ratio
950
- latent_width = width // self.spatial_compression_ratio
951
-
952
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
953
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
954
- tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
955
- tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
956
-
957
- blend_height = tile_latent_min_height - tile_latent_stride_height
958
- blend_width = tile_latent_min_width - tile_latent_stride_width
959
-
960
- # Split x into overlapping tiles and encode them separately.
961
- # The tiles have an overlap to avoid seams between tiles.
962
- rows = []
963
- for i in range(0, height, self.tile_sample_stride_height):
964
- row = []
965
- for j in range(0, width, self.tile_sample_stride_width):
966
- self.clear_cache()
967
- time = []
968
- frame_range = 1 + (num_frames - 1) // 4
969
- for k in range(frame_range):
970
- self._enc_conv_idx = [0]
971
- if k == 0:
972
- tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
973
- else:
974
- tile = x[
975
- :,
976
- :,
977
- 1 + 4 * (k - 1) : 1 + 4 * k,
978
- i : i + self.tile_sample_min_height,
979
- j : j + self.tile_sample_min_width,
980
- ]
981
- tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
982
- tile = self.quant_conv(tile)
983
- time.append(tile)
984
- row.append(torch.cat(time, dim=2))
985
- rows.append(row)
986
- self.clear_cache()
987
-
988
- result_rows = []
989
- for i, row in enumerate(rows):
990
- result_row = []
991
- for j, tile in enumerate(row):
992
- # blend the above tile and the left tile
993
- # to the current tile and add the current tile to the result row
994
- if i > 0:
995
- tile = self.blend_v(rows[i - 1][j], tile, blend_height)
996
- if j > 0:
997
- tile = self.blend_h(row[j - 1], tile, blend_width)
998
- result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
999
- result_rows.append(torch.cat(result_row, dim=-1))
1000
-
1001
- enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
1002
- return enc
1003
-
1004
- def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1005
- r"""
1006
- Decode a batch of images using a tiled decoder.
1007
-
1008
- Args:
1009
- z (`torch.Tensor`): Input batch of latent vectors.
1010
- return_dict (`bool`, *optional*, defaults to `True`):
1011
- Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
1012
-
1013
- Returns:
1014
- [`~models.vae.DecoderOutput`] or `tuple`:
1015
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
1016
- returned.
1017
- """
1018
- _, _, num_frames, height, width = z.shape
1019
- sample_height = height * self.spatial_compression_ratio
1020
- sample_width = width * self.spatial_compression_ratio
1021
-
1022
- tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
1023
- tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
1024
- tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
1025
- tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
1026
-
1027
- blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
1028
- blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
1029
-
1030
- # Split z into overlapping tiles and decode them separately.
1031
- # The tiles have an overlap to avoid seams between tiles.
1032
- rows = []
1033
- for i in range(0, height, tile_latent_stride_height):
1034
- row = []
1035
- for j in range(0, width, tile_latent_stride_width):
1036
- self.clear_cache()
1037
- time = []
1038
- for k in range(num_frames):
1039
- self._conv_idx = [0]
1040
- tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
1041
- tile = self.post_quant_conv(tile)
1042
- decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx)
1043
- time.append(decoded)
1044
- row.append(torch.cat(time, dim=2))
1045
- rows.append(row)
1046
- self.clear_cache()
1047
-
1048
- result_rows = []
1049
- for i, row in enumerate(rows):
1050
- result_row = []
1051
- for j, tile in enumerate(row):
1052
- # blend the above tile and the left tile
1053
- # to the current tile and add the current tile to the result row
1054
- if i > 0:
1055
- tile = self.blend_v(rows[i - 1][j], tile, blend_height)
1056
- if j > 0:
1057
- tile = self.blend_h(row[j - 1], tile, blend_width)
1058
- result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
1059
- result_rows.append(torch.cat(result_row, dim=-1))
1060
-
1061
- dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
1062
-
1063
- if not return_dict:
1064
- return (dec,)
1065
- return DecoderOutput(sample=dec)
1066
-
1067
- def forward(
1068
- self,
1069
- sample: torch.Tensor,
1070
- sample_posterior: bool = False,
1071
- return_dict: bool = True,
1072
- generator: Optional[torch.Generator] = None,
1073
- ) -> Union[DecoderOutput, torch.Tensor]:
1074
- """
1075
- Args:
1076
- sample (`torch.Tensor`): Input sample.
1077
- return_dict (`bool`, *optional*, defaults to `True`):
1078
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
1079
- """
1080
- x = sample
1081
- posterior = self.encode(x).latent_dist
1082
- if sample_posterior:
1083
- z = posterior.sample(generator=generator)
1084
- else:
1085
- z = posterior.mode()
1086
- dec = self.decode(z, return_dict=return_dict)
1087
- return dec
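For reference, a hedged usage sketch of the deleted `AutoencoderKLQwenImage` (assuming a checkout that still contains the module and its diffusers dependencies; the import path, shapes, and frame counts below are illustrative, not prescriptive):

```python
import torch
from videox_fun.models.qwenimage_vae import AutoencoderKLQwenImage  # assumed import path

vae = AutoencoderKLQwenImage()            # defaults: z_dim=16, 8x spatial compression
vae.enable_tiling()                       # optional: spatial tiling to bound memory use
video = torch.randn(1, 3, 13, 256, 256)   # [B, C, T, H, W] with T = 1 + 4n frames

with torch.no_grad():
    latents = vae.encode(video).latent_dist.sample()   # [1, 16, 4, 32, 32]
    recon = vae.decode(latents).sample                 # back to [1, 3, 13, 256, 256]
```
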
videox_fun/models/wan_animate_adapter.py DELETED
@@ -1,397 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import math
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- import numpy as np
8
- from einops import rearrange
9
- from torch import nn
10
-
11
- try:
12
- from flash_attn import flash_attn_func, flash_attn_qkvpacked_func
13
- except ImportError:
14
- flash_attn_func, flash_attn_qkvpacked_func = None, None
15
-
16
-
17
- MEMORY_LAYOUT = {
18
- "flash": (
19
- lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
20
- lambda x: x,
21
- ),
22
- "torch": (
23
- lambda x: x.transpose(1, 2),
24
- lambda x: x.transpose(1, 2),
25
- ),
26
- "vanilla": (
27
- lambda x: x.transpose(1, 2),
28
- lambda x: x.transpose(1, 2),
29
- ),
30
- }
31
-
32
-
33
- def attention(
34
- q,
35
- k,
36
- v,
37
- mode="flash",
38
- drop_rate=0,
39
- attn_mask=None,
40
- causal=False,
41
- max_seqlen_q=None,
42
- batch_size=1,
43
- ):
44
- """
45
- Perform QKV self attention.
46
-
47
- Args:
48
- q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
49
- k (torch.Tensor): Key tensor with shape [b, s1, a, d]
50
- v (torch.Tensor): Value tensor with shape [b, s1, a, d]
51
- mode (str): Attention mode. Choose from 'flash', 'torch', and 'vanilla'.
52
- drop_rate (float): Dropout rate in attention map. (default: 0)
53
- attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
54
- (default: None)
55
- causal (bool): Whether to use causal attention. (default: False)
56
- cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
57
- used to index into q.
58
- cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
59
- used to index into kv.
60
- max_seqlen_q (int): The maximum sequence length in the batch of q.
61
- max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
62
-
63
- Returns:
64
- torch.Tensor: Output tensor after self attention with shape [b, s, ad]
65
- """
66
- pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
67
-
68
- if mode == "torch":
69
- if attn_mask is not None and attn_mask.dtype != torch.bool:
70
- attn_mask = attn_mask.to(q.dtype)
71
- x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
72
-
73
- elif mode == "flash":
74
- x = flash_attn_func(
75
- q,
76
- k,
77
- v,
78
- )
79
- x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # reshape x to [b, s, a, d]
80
- elif mode == "vanilla":
81
- scale_factor = 1 / math.sqrt(q.size(-1))
82
-
83
- b, a, s, _ = q.shape
84
- s1 = k.size(2)
85
- attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
86
- if causal:
87
- # Only applied to self attention
88
- assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
89
- temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
90
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
91
- attn_bias.to(q.dtype)
92
-
93
- if attn_mask is not None:
94
- if attn_mask.dtype == torch.bool:
95
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
96
- else:
97
- attn_bias += attn_mask
98
-
99
- attn = (q @ k.transpose(-2, -1)) * scale_factor
100
- attn += attn_bias
101
- attn = attn.softmax(dim=-1)
102
- attn = torch.dropout(attn, p=drop_rate, train=True)
103
- x = attn @ v
104
- else:
105
- raise NotImplementedError(f"Unsupported attention mode: {mode}")
106
-
107
- x = post_attn_layout(x)
108
- b, s, a, d = x.shape
109
- out = x.reshape(b, s, -1)
110
- return out
111
-
112
-
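A hedged usage sketch for the `attention` helper above in its "torch" branch: as written, that branch forwards `q`/`k`/`v` straight to `F.scaled_dot_product_attention`, so the tensors are assumed to already be in head-first layout `[b, heads, s, head_dim]` (the "flash" branch instead expects `[b, s, heads, head_dim]` and the flash-attn package):

```python
import torch

b, heads, s, head_dim = 2, 4, 16, 32
q = torch.randn(b, heads, s, head_dim)
k = torch.randn(b, heads, s, head_dim)
v = torch.randn(b, heads, s, head_dim)

out = attention(q, k, v, mode="torch")   # assumes the function defined above is in scope
print(out.shape)                         # torch.Size([2, 16, 128]) = [b, s, heads * head_dim]
```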
113
- class CausalConv1d(nn.Module):
114
-
115
- def __init__(self, chan_in, chan_out, kernel_size=3, stride=1, dilation=1, pad_mode="replicate", **kwargs):
116
- super().__init__()
117
-
118
- self.pad_mode = pad_mode
119
- padding = (kernel_size - 1, 0) # T
120
- self.time_causal_padding = padding
121
-
122
- self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs)
123
-
124
- def forward(self, x):
125
- x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
126
- return self.conv(x)
127
-
128
-
129
-
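The `CausalConv1d` above pads only on the left of the time axis, so no output step depends on future inputs; a minimal illustration of the padding (replicate mode, kernel size 3):

```python
import torch
import torch.nn.functional as F

x = torch.arange(5.0).view(1, 1, 5)            # [B, C, T]
padded = F.pad(x, (2, 0), mode="replicate")    # (kernel_size - 1, 0): pad the past only
print(padded)                                  # tensor([[[0., 0., 0., 1., 2., 3., 4.]]])
```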
130
- class FaceEncoder(nn.Module):
131
- def __init__(self, in_dim: int, hidden_dim: int, num_heads: int, dtype=None, device=None):
132
- factory_kwargs = {"dtype": dtype, "device": device}
133
- super().__init__()
134
-
135
- self.num_heads = num_heads
136
- self.conv1_local = CausalConv1d(in_dim, 1024 * num_heads, 3, stride=1)
137
- self.norm1 = nn.LayerNorm(hidden_dim // 8, elementwise_affine=False, eps=1e-6, **factory_kwargs)
138
- self.act = nn.SiLU()
139
- self.conv2 = CausalConv1d(1024, 1024, 3, stride=2)
140
- self.conv3 = CausalConv1d(1024, 1024, 3, stride=2)
141
-
142
- self.out_proj = nn.Linear(1024, hidden_dim)
143
- self.norm1 = nn.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs)
144
-
145
- self.norm2 = nn.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs)
146
-
147
- self.norm3 = nn.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs)
148
-
149
- self.padding_tokens = nn.Parameter(torch.zeros(1, 1, 1, hidden_dim))
150
-
151
- def forward(self, x):
152
-
153
- x = rearrange(x, "b t c -> b c t")
154
- b, c, t = x.shape
155
-
156
- x = self.conv1_local(x)
157
- x = rearrange(x, "b (n c) t -> (b n) t c", n=self.num_heads)
158
-
159
- x = self.norm1(x)
160
- x = self.act(x)
161
- x = rearrange(x, "b t c -> b c t")
162
- x = self.conv2(x)
163
- x = rearrange(x, "b c t -> b t c")
164
- x = self.norm2(x)
165
- x = self.act(x)
166
- x = rearrange(x, "b t c -> b c t")
167
- x = self.conv3(x)
168
- x = rearrange(x, "b c t -> b t c")
169
- x = self.norm3(x)
170
- x = self.act(x)
171
- x = self.out_proj(x)
172
- x = rearrange(x, "(b n) t c -> b t n c", b=b)
173
- padding = self.padding_tokens.repeat(b, x.shape[1], 1, 1)
174
- x = torch.cat([x, padding], dim=-2)
175
- x_local = x.clone()
176
-
177
- return x_local
178
-
179
-
180
-
181
- class RMSNorm(nn.Module):
182
- def __init__(
183
- self,
184
- dim: int,
185
- elementwise_affine=True,
186
- eps: float = 1e-6,
187
- device=None,
188
- dtype=None,
189
- ):
190
- """
191
- Initialize the RMSNorm normalization layer.
192
-
193
- Args:
194
- dim (int): The dimension of the input tensor.
195
- eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
196
-
197
- Attributes:
198
- eps (float): A small value added to the denominator for numerical stability.
199
- weight (nn.Parameter): Learnable scaling parameter.
200
-
201
- """
202
- factory_kwargs = {"device": device, "dtype": dtype}
203
- super().__init__()
204
- self.eps = eps
205
- if elementwise_affine:
206
- self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
207
-
208
- def _norm(self, x):
209
- """
210
- Apply the RMSNorm normalization to the input tensor.
211
-
212
- Args:
213
- x (torch.Tensor): The input tensor.
214
-
215
- Returns:
216
- torch.Tensor: The normalized tensor.
217
-
218
- """
219
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
220
-
221
- def forward(self, x):
222
- """
223
- Forward pass through the RMSNorm layer.
224
-
225
- Args:
226
- x (torch.Tensor): The input tensor.
227
-
228
- Returns:
229
- torch.Tensor: The output tensor after applying RMSNorm.
230
-
231
- """
232
- output = self._norm(x.float()).type_as(x)
233
- if hasattr(self, "weight"):
234
- output = output * self.weight
235
- return output
236
-
237
-
238
- def get_norm_layer(norm_layer):
239
- """
240
- Get the normalization layer.
241
-
242
- Args:
243
- norm_layer (str): The type of normalization layer.
244
-
245
- Returns:
246
- norm_layer (nn.Module): The normalization layer.
247
- """
248
- if norm_layer == "layer":
249
- return nn.LayerNorm
250
- elif norm_layer == "rms":
251
- return RMSNorm
252
- else:
253
- raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
254
-
255
-
256
- class FaceAdapter(nn.Module):
257
- def __init__(
258
- self,
259
- hidden_dim: int,
260
- heads_num: int,
261
- qk_norm: bool = True,
262
- qk_norm_type: str = "rms",
263
- num_adapter_layers: int = 1,
264
- dtype=None,
265
- device=None,
266
- ):
267
-
268
- factory_kwargs = {"dtype": dtype, "device": device}
269
- super().__init__()
270
- self.hidden_size = hidden_dim
271
- self.heads_num = heads_num
272
- self.fuser_blocks = nn.ModuleList(
273
- [
274
- FaceBlock(
275
- self.hidden_size,
276
- self.heads_num,
277
- qk_norm=qk_norm,
278
- qk_norm_type=qk_norm_type,
279
- **factory_kwargs,
280
- )
281
- for _ in range(num_adapter_layers)
282
- ]
283
- )
284
-
285
- def forward(
286
- self,
287
- x: torch.Tensor,
288
- motion_embed: torch.Tensor,
289
- idx: int,
290
- freqs_cis_q: Tuple[torch.Tensor, torch.Tensor] = None,
291
- freqs_cis_k: Tuple[torch.Tensor, torch.Tensor] = None,
292
- ) -> torch.Tensor:
293
-
294
- return self.fuser_blocks[idx](x, motion_embed, freqs_cis_q, freqs_cis_k)
295
-
296
-
297
-
298
- class FaceBlock(nn.Module):
299
- def __init__(
300
- self,
301
- hidden_size: int,
302
- heads_num: int,
303
- qk_norm: bool = True,
304
- qk_norm_type: str = "rms",
305
- qk_scale: float = None,
306
- dtype: Optional[torch.dtype] = None,
307
- device: Optional[torch.device] = None,
308
- ):
309
- factory_kwargs = {"device": device, "dtype": dtype}
310
- super().__init__()
311
-
312
- self.deterministic = False
313
- self.hidden_size = hidden_size
314
- self.heads_num = heads_num
315
- head_dim = hidden_size // heads_num
316
- self.scale = qk_scale or head_dim**-0.5
317
-
318
- self.linear1_kv = nn.Linear(hidden_size, hidden_size * 2, **factory_kwargs)
319
- self.linear1_q = nn.Linear(hidden_size, hidden_size, **factory_kwargs)
320
-
321
- self.linear2 = nn.Linear(hidden_size, hidden_size, **factory_kwargs)
322
-
323
- qk_norm_layer = get_norm_layer(qk_norm_type)
324
- self.q_norm = (
325
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
326
- )
327
- self.k_norm = (
328
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity()
329
- )
330
-
331
- self.pre_norm_feat = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
332
-
333
- self.pre_norm_motion = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
334
-
335
- def forward(
336
- self,
337
- x: torch.Tensor,
338
- motion_vec: torch.Tensor,
339
- motion_mask: Optional[torch.Tensor] = None,
340
- use_context_parallel=False,
341
- all_gather=None,
342
- sp_world_size=1,
343
- sp_world_rank=0,
344
- ) -> torch.Tensor:
345
- dtype = x.dtype
346
- B, T, N, C = motion_vec.shape
347
- T_comp = T
348
-
349
- x_motion = self.pre_norm_motion(motion_vec)
350
- x_feat = self.pre_norm_feat(x)
351
-
352
- kv = self.linear1_kv(x_motion)
353
- q = self.linear1_q(x_feat)
354
-
355
- k, v = rearrange(kv, "B L N (K H D) -> K B L N H D", K=2, H=self.heads_num)
356
- q = rearrange(q, "B S (H D) -> B S H D", H=self.heads_num)
357
-
358
- # Apply QK-Norm if needed.
359
- q = self.q_norm(q).to(v)
360
- k = self.k_norm(k).to(v)
361
-
362
- k = rearrange(k, "B L N H D -> (B L) N H D")
363
- v = rearrange(v, "B L N H D -> (B L) N H D")
364
-
365
- if use_context_parallel:
366
- q = all_gather(q, dim=1)
367
-
368
- length = int(np.floor(q.size()[1] / T_comp) * T_comp)
369
- origin_length = q.size()[1]
370
- if origin_length > length:
371
- q_pad = q[:, length:]
372
- q = q[:, :length]
373
-
374
- q = rearrange(q, "B (L S) H D -> (B L) S H D", L=T_comp)
375
- q, k, v = q.to(dtype), k.to(dtype), v.to(dtype)
376
- # Compute attention.
377
- attn = attention(
378
- q,
379
- k,
380
- v,
381
- max_seqlen_q=q.shape[1],
382
- batch_size=q.shape[0],
383
- )
384
-
385
- attn = rearrange(attn, "(B L) S C -> B (L S) C", L=T_comp)
386
- if use_context_parallel:
387
- q_pad = rearrange(q_pad, "B L H D -> B L (H D)")
388
- if origin_length > length:
389
- attn = torch.cat([attn, q_pad], dim=1)
390
- attn = torch.chunk(attn, sp_world_size, dim=1)[sp_world_rank]
391
-
392
- output = self.linear2(attn)
393
-
394
- if motion_mask is not None:
395
- output = output * rearrange(motion_mask, "B T H W -> B (T H W)").unsqueeze(-1)
396
-
397
- return output
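A hedged sketch of driving the `FaceEncoder` above (assuming the deleted module is importable from an earlier checkout; sizes are illustrative). A `[B, T, in_dim]` motion-feature sequence becomes per-frame face tokens of shape `[B, T', num_heads + 1, hidden_dim]`, where the two stride-2 causal convolutions reduce `T` to `T'`:

```python
import torch
from videox_fun.models.wan_animate_adapter import FaceEncoder  # assumed import path

enc = FaceEncoder(in_dim=512, hidden_dim=5120, num_heads=4)
motion = torch.randn(1, 81, 512)      # [B, T, in_dim]
with torch.no_grad():
    tokens = enc(motion)
print(tokens.shape)                   # torch.Size([1, 21, 5, 5120]): T'=21, num_heads+1=5
```
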
videox_fun/models/wan_animate_motion_encoder.py DELETED
@@ -1,309 +0,0 @@
1
- # Modified from ``https://github.com/wyhsirius/LIA``
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import math
4
-
5
- import torch
6
- import torch.nn as nn
7
- from torch.nn import functional as F
8
-
9
-
10
- def custom_qr(input_tensor):
11
- original_dtype = input_tensor.dtype
12
- if original_dtype == torch.bfloat16:
13
- q, r = torch.linalg.qr(input_tensor.to(torch.float32))
14
- return q.to(original_dtype), r.to(original_dtype)
15
- return torch.linalg.qr(input_tensor)
16
-
17
- def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
18
- return F.leaky_relu(input + bias, negative_slope) * scale
19
-
20
-
21
- def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1):
22
- _, minor, in_h, in_w = input.shape
23
- kernel_h, kernel_w = kernel.shape
24
-
25
- out = input.view(-1, minor, in_h, 1, in_w, 1)
26
- out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1, 0, 0])
27
- out = out.view(-1, minor, in_h * up_y, in_w * up_x)
28
-
29
- out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)])
30
- out = out[:, :, max(-pad_y0, 0): out.shape[2] - max(-pad_y1, 0),
31
- max(-pad_x0, 0): out.shape[3] - max(-pad_x1, 0), ]
32
-
33
- out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
34
- w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
35
- out = F.conv2d(out, w)
36
- out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
37
- in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, )
38
- return out[:, :, ::down_y, ::down_x]
39
-
40
-
41
- def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
42
- return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1])
43
-
44
-
45
- def make_kernel(k):
46
- k = torch.tensor(k, dtype=torch.float32)
47
- if k.ndim == 1:
48
- k = k[None, :] * k[:, None]
49
- k /= k.sum()
50
- return k
51
-
52
-
53
- class FusedLeakyReLU(nn.Module):
54
- def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5):
55
- super().__init__()
56
- self.bias = nn.Parameter(torch.zeros(1, channel, 1, 1))
57
- self.negative_slope = negative_slope
58
- self.scale = scale
59
-
60
- def forward(self, input):
61
- out = fused_leaky_relu(input, self.bias, self.negative_slope, self.scale)
62
- return out
63
-
64
-
65
- class Blur(nn.Module):
66
- def __init__(self, kernel, pad, upsample_factor=1):
67
- super().__init__()
68
-
69
- kernel = make_kernel(kernel)
70
-
71
- if upsample_factor > 1:
72
- kernel = kernel * (upsample_factor ** 2)
73
-
74
- self.register_buffer('kernel', kernel)
75
-
76
- self.pad = pad
77
-
78
- def forward(self, input):
79
- return upfirdn2d(input, self.kernel, pad=self.pad)
80
-
81
-
82
- class ScaledLeakyReLU(nn.Module):
83
- def __init__(self, negative_slope=0.2):
84
- super().__init__()
85
-
86
- self.negative_slope = negative_slope
87
-
88
- def forward(self, input):
89
- return F.leaky_relu(input, negative_slope=self.negative_slope)
90
-
91
-
92
- class EqualConv2d(nn.Module):
93
- def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True):
94
- super().__init__()
95
-
96
- self.weight = nn.Parameter(torch.randn(out_channel, in_channel, kernel_size, kernel_size))
97
- self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2)
98
-
99
- self.stride = stride
100
- self.padding = padding
101
-
102
- if bias:
103
- self.bias = nn.Parameter(torch.zeros(out_channel))
104
- else:
105
- self.bias = None
106
-
107
- def forward(self, input):
108
-
109
- return F.conv2d(input, self.weight * self.scale, bias=self.bias, stride=self.stride, padding=self.padding)
110
-
111
- def __repr__(self):
112
- return (
113
- f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},'
114
- f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})'
115
- )
116
-
117
-
118
- class EqualLinear(nn.Module):
119
- def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None):
120
- super().__init__()
121
-
122
- self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
123
-
124
- if bias:
125
- self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
126
- else:
127
- self.bias = None
128
-
129
- self.activation = activation
130
-
131
- self.scale = (1 / math.sqrt(in_dim)) * lr_mul
132
- self.lr_mul = lr_mul
133
-
134
- def forward(self, input):
135
-
136
- if self.activation:
137
- out = F.linear(input, self.weight * self.scale)
138
- out = fused_leaky_relu(out, self.bias * self.lr_mul)
139
- else:
140
- out = F.linear(input, self.weight * self.scale, bias=self.bias * self.lr_mul)
141
-
142
- return out
143
-
144
- def __repr__(self):
145
- return (f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})')
146
-
147
-
148
- class ConvLayer(nn.Sequential):
149
- def __init__(
150
- self,
151
- in_channel,
152
- out_channel,
153
- kernel_size,
154
- downsample=False,
155
- blur_kernel=[1, 3, 3, 1],
156
- bias=True,
157
- activate=True,
158
- ):
159
- layers = []
160
-
161
- if downsample:
162
- factor = 2
163
- p = (len(blur_kernel) - factor) + (kernel_size - 1)
164
- pad0 = (p + 1) // 2
165
- pad1 = p // 2
166
-
167
- layers.append(Blur(blur_kernel, pad=(pad0, pad1)))
168
-
169
- stride = 2
170
- self.padding = 0
171
-
172
- else:
173
- stride = 1
174
- self.padding = kernel_size // 2
175
-
176
- layers.append(EqualConv2d(in_channel, out_channel, kernel_size, padding=self.padding, stride=stride,
177
- bias=bias and not activate))
178
-
179
- if activate:
180
- if bias:
181
- layers.append(FusedLeakyReLU(out_channel))
182
- else:
183
- layers.append(ScaledLeakyReLU(0.2))
184
-
185
- super().__init__(*layers)
186
-
187
-
188
- class ResBlock(nn.Module):
189
- def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]):
190
- super().__init__()
191
-
192
- self.conv1 = ConvLayer(in_channel, in_channel, 3)
193
- self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True)
194
-
195
- self.skip = ConvLayer(in_channel, out_channel, 1, downsample=True, activate=False, bias=False)
196
-
197
- def forward(self, input):
198
- out = self.conv1(input)
199
- out = self.conv2(out)
200
-
201
- skip = self.skip(input)
202
- out = (out + skip) / math.sqrt(2)
203
-
204
- return out
205
-
206
-
207
- class EncoderApp(nn.Module):
208
- def __init__(self, size, w_dim=512):
209
- super(EncoderApp, self).__init__()
210
-
211
- channels = {
212
- 4: 512,
213
- 8: 512,
214
- 16: 512,
215
- 32: 512,
216
- 64: 256,
217
- 128: 128,
218
- 256: 64,
219
- 512: 32,
220
- 1024: 16
221
- }
222
-
223
- self.w_dim = w_dim
224
- log_size = int(math.log(size, 2))
225
-
226
- self.convs = nn.ModuleList()
227
- self.convs.append(ConvLayer(3, channels[size], 1))
228
-
229
- in_channel = channels[size]
230
- for i in range(log_size, 2, -1):
231
- out_channel = channels[2 ** (i - 1)]
232
- self.convs.append(ResBlock(in_channel, out_channel))
233
- in_channel = out_channel
234
-
235
- self.convs.append(EqualConv2d(in_channel, self.w_dim, 4, padding=0, bias=False))
236
-
237
- def forward(self, x):
238
-
239
- res = []
240
- h = x
241
- for conv in self.convs:
242
- h = conv(h)
243
- res.append(h)
244
-
245
- return res[-1].squeeze(-1).squeeze(-1), res[::-1][2:]
246
-
247
-
248
- class Encoder(nn.Module):
249
- def __init__(self, size, dim=512, dim_motion=20):
250
- super(Encoder, self).__init__()
251
-
252
- # appearance network
253
- self.net_app = EncoderApp(size, dim)
254
-
255
- # motion network
256
- fc = [EqualLinear(dim, dim)]
257
- for i in range(3):
258
- fc.append(EqualLinear(dim, dim))
259
-
260
- fc.append(EqualLinear(dim, dim_motion))
261
- self.fc = nn.Sequential(*fc)
262
-
263
- def enc_app(self, x):
264
- h_source = self.net_app(x)
265
- return h_source
266
-
267
- def enc_motion(self, x):
268
- h, _ = self.net_app(x)
269
- h_motion = self.fc(h)
270
- return h_motion
271
-
272
-
273
- class Direction(nn.Module):
274
- def __init__(self, motion_dim):
275
- super(Direction, self).__init__()
276
- self.weight = nn.Parameter(torch.randn(512, motion_dim))
277
-
278
- def forward(self, input):
279
-
280
- weight = self.weight + 1e-8
281
- Q, R = custom_qr(weight)
282
- if input is None:
283
- return Q
284
- else:
285
- input_diag = torch.diag_embed(input) # alpha, diagonal matrix
286
- out = torch.matmul(input_diag, Q.T)
287
- out = torch.sum(out, dim=1)
288
- return out
289
-
290
-
291
- class Synthesis(nn.Module):
292
- def __init__(self, motion_dim):
293
- super(Synthesis, self).__init__()
294
- self.direction = Direction(motion_dim)
295
-
296
-
297
- class Generator(nn.Module):
298
- def __init__(self, size, style_dim=512, motion_dim=20):
299
- super().__init__()
300
-
301
- self.enc = Encoder(size, style_dim, motion_dim)
302
- self.dec = Synthesis(motion_dim)
303
-
304
- def get_motion(self, img):
305
- #motion_feat = self.enc.enc_motion(img)
306
- motion_feat = torch.utils.checkpoint.checkpoint((self.enc.enc_motion), img, use_reentrant=True)
307
- with torch.cuda.amp.autocast(dtype=torch.float32):
308
- motion = self.dec.direction(motion_feat)
309
- return motion
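A hedged sketch of the LIA-style motion encoder above (assuming the deleted module is importable from an earlier checkout). A square face crop whose side matches `size` is mapped to a 512-dimensional motion code through the orthogonalised direction basis:

```python
import torch
from videox_fun.models.wan_animate_motion_encoder import Generator  # assumed import path

motion_encoder = Generator(size=256, style_dim=512, motion_dim=20)
face = torch.randn(1, 3, 256, 256)    # [B, 3, size, size]
with torch.no_grad():
    motion = motion_encoder.get_motion(face)
print(motion.shape)                   # torch.Size([1, 512])
```
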
videox_fun/models/wan_audio_encoder.py DELETED
@@ -1,213 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/s2v/audio_encoder.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import math
4
-
5
- import librosa
6
- import numpy as np
7
- import torch
8
- import torch.nn.functional as F
9
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
10
- from diffusers.configuration_utils import ConfigMixin
11
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
12
- from diffusers.models.modeling_utils import ModelMixin
13
-
14
-
15
- def get_sample_indices(original_fps,
16
- total_frames,
17
- target_fps,
18
- num_sample,
19
- fixed_start=None):
20
- required_duration = num_sample / target_fps
21
- required_origin_frames = int(np.ceil(required_duration * original_fps))
22
- if required_duration > total_frames / original_fps:
23
- raise ValueError("required_duration must be less than video length")
24
-
25
- if fixed_start is not None and fixed_start >= 0:
26
- start_frame = fixed_start
27
- else:
28
- max_start = total_frames - required_origin_frames
29
- if max_start < 0:
30
- raise ValueError("video length is too short")
31
- start_frame = np.random.randint(0, max_start + 1)
32
- start_time = start_frame / original_fps
33
-
34
- end_time = start_time + required_duration
35
- time_points = np.linspace(start_time, end_time, num_sample, endpoint=False)
36
-
37
- frame_indices = np.round(np.array(time_points) * original_fps).astype(int)
38
- frame_indices = np.clip(frame_indices, 0, total_frames - 1)
39
- return frame_indices
40
-
41
-
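A short sketch of what `get_sample_indices` above returns (the helper and numpy are assumed to be in scope): sampling 32 frames at 16 fps from a 30 fps clip starting at frame 0 covers two seconds of video:

```python
idx = get_sample_indices(original_fps=30, total_frames=90, target_fps=16,
                         num_sample=32, fixed_start=0)
print(idx[:8])   # [ 0  2  4  6  8  9 11 13] -- indices into the 30 fps frame sequence
```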
42
- def linear_interpolation(features, input_fps, output_fps, output_len=None):
43
- """
44
- features: shape=[1, T, 512]
45
- input_fps: fps for audio, f_a
46
- output_fps: fps for video, f_m
47
- output_len: video length
48
- """
49
- features = features.transpose(1, 2) # [1, 512, T]
50
- seq_len = features.shape[2] / float(input_fps) # T/f_a
51
- if output_len is None:
52
- output_len = int(seq_len * output_fps) # f_m*T/f_a
53
- output_features = F.interpolate(
54
- features, size=output_len, align_corners=True,
55
- mode='linear') # [1, 512, output_len]
56
- return output_features.transpose(1, 2) # [1, output_len, 512]
57
-
58
-
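`linear_interpolation` above resamples wav2vec2 features (50 feature-frames per second) onto the video frame rate; a small sketch (the helper is assumed to be in scope, and 30 fps is just an example rate):

```python
import torch

feats = torch.randn(1, 100, 512)                           # 2 s of audio features at 50 fps
aligned = linear_interpolation(feats, input_fps=50, output_fps=30)
print(aligned.shape)                                       # torch.Size([1, 60, 512])
```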
59
- class WanAudioEncoder(ModelMixin, ConfigMixin, FromOriginalModelMixin):
60
-
61
- def __init__(self, pretrained_model_path="facebook/wav2vec2-base-960h", device='cpu'):
62
- super(WanAudioEncoder, self).__init__()
63
- # load pretrained model
64
- self.processor = Wav2Vec2Processor.from_pretrained(pretrained_model_path)
65
- self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_model_path)
66
-
67
- self.model = self.model.to(device)
68
-
69
- self.video_rate = 30
70
-
71
- def extract_audio_feat(self,
72
- audio_path,
73
- return_all_layers=False,
74
- dtype=torch.float32):
75
- audio_input, sample_rate = librosa.load(audio_path, sr=16000)
76
-
77
- input_values = self.processor(
78
- audio_input, sampling_rate=sample_rate, return_tensors="pt"
79
- ).input_values
80
-
81
- # INFERENCE
82
-
83
- # retrieve logits & take argmax
84
- res = self.model(
85
- input_values.to(self.model.device), output_hidden_states=True)
86
- if return_all_layers:
87
- feat = torch.cat(res.hidden_states)
88
- else:
89
- feat = res.hidden_states[-1]
90
- feat = linear_interpolation(
91
- feat, input_fps=50, output_fps=self.video_rate)
92
-
93
- z = feat.to(dtype) # Encoding for the motion
94
- return z
95
-
96
- def extract_audio_feat_without_file_load(self, audio_input, sample_rate, return_all_layers=False, dtype=torch.float32):
97
- input_values = self.processor(
98
- audio_input, sampling_rate=sample_rate, return_tensors="pt"
99
- ).input_values
100
-
101
- # INFERENCE
102
- # retrieve the wav2vec2 hidden states (no CTC argmax is taken here)
103
- res = self.model(
104
- input_values.to(self.model.device), output_hidden_states=True)
105
- if return_all_layers:
106
- feat = torch.cat(res.hidden_states)
107
- else:
108
- feat = res.hidden_states[-1]
109
- feat = linear_interpolation(
110
- feat, input_fps=50, output_fps=self.video_rate)
111
-
112
- z = feat.to(dtype) # Encoding for the motion
113
- return z
114
-
115
- def get_audio_embed_bucket(self,
116
- audio_embed,
117
- stride=2,
118
- batch_frames=12,
119
- m=2):
120
- num_layers, audio_frame_num, audio_dim = audio_embed.shape
121
-
122
- if num_layers > 1:
123
- return_all_layers = True
124
- else:
125
- return_all_layers = False
126
-
127
- min_batch_num = int(audio_frame_num / (batch_frames * stride)) + 1
128
-
129
- bucket_num = min_batch_num * batch_frames
130
- batch_idx = [stride * i for i in range(bucket_num)]
131
- batch_audio_eb = []
132
- for bi in batch_idx:
133
- if bi < audio_frame_num:
134
- audio_sample_stride = 2
135
- chosen_idx = list(
136
- range(bi - m * audio_sample_stride,
137
- bi + (m + 1) * audio_sample_stride,
138
- audio_sample_stride))
139
- chosen_idx = [0 if c < 0 else c for c in chosen_idx]
140
- chosen_idx = [
141
- audio_frame_num - 1 if c >= audio_frame_num else c
142
- for c in chosen_idx
143
- ]
144
-
145
- if return_all_layers:
146
- frame_audio_embed = audio_embed[:, chosen_idx].flatten(
147
- start_dim=-2, end_dim=-1)
148
- else:
149
- frame_audio_embed = audio_embed[0][chosen_idx].flatten()
150
- else:
151
- frame_audio_embed = \
152
- torch.zeros([audio_dim * (2 * m + 1)], device=audio_embed.device) if not return_all_layers \
153
- else torch.zeros([num_layers, audio_dim * (2 * m + 1)], device=audio_embed.device)
154
- batch_audio_eb.append(frame_audio_embed)
155
- batch_audio_eb = torch.cat([c.unsqueeze(0) for c in batch_audio_eb],
156
- dim=0)
157
-
158
- return batch_audio_eb, min_batch_num
159
-
160
- def get_audio_embed_bucket_fps(self,
161
- audio_embed,
162
- fps=16,
163
- batch_frames=81,
164
- m=0):
165
- num_layers, audio_frame_num, audio_dim = audio_embed.shape
166
-
167
- if num_layers > 1:
168
- return_all_layers = True
169
- else:
170
- return_all_layers = False
171
-
172
- scale = self.video_rate / fps
173
-
174
- min_batch_num = int(audio_frame_num / (batch_frames * scale)) + 1
175
-
176
- bucket_num = min_batch_num * batch_frames
177
- padd_audio_num = math.ceil(min_batch_num * batch_frames / fps *
178
- self.video_rate) - audio_frame_num
179
- batch_idx = get_sample_indices(
180
- original_fps=self.video_rate,
181
- total_frames=audio_frame_num + padd_audio_num,
182
- target_fps=fps,
183
- num_sample=bucket_num,
184
- fixed_start=0)
185
- batch_audio_eb = []
186
- audio_sample_stride = int(self.video_rate / fps)
187
- for bi in batch_idx:
188
- if bi < audio_frame_num:
189
-
190
- chosen_idx = list(
191
- range(bi - m * audio_sample_stride,
192
- bi + (m + 1) * audio_sample_stride,
193
- audio_sample_stride))
194
- chosen_idx = [0 if c < 0 else c for c in chosen_idx]
195
- chosen_idx = [
196
- audio_frame_num - 1 if c >= audio_frame_num else c
197
- for c in chosen_idx
198
- ]
199
-
200
- if return_all_layers:
201
- frame_audio_embed = audio_embed[:, chosen_idx].flatten(
202
- start_dim=-2, end_dim=-1)
203
- else:
204
- frame_audio_embed = audio_embed[0][chosen_idx].flatten()
205
- else:
206
- frame_audio_embed = \
207
- torch.zeros([audio_dim * (2 * m + 1)], device=audio_embed.device) if not return_all_layers \
208
- else torch.zeros([num_layers, audio_dim * (2 * m + 1)], device=audio_embed.device)
209
- batch_audio_eb.append(frame_audio_embed)
210
- batch_audio_eb = torch.cat([c.unsqueeze(0) for c in batch_audio_eb],
211
- dim=0)
212
-
213
- return batch_audio_eb, min_batch_num
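Note: the encoder above resamples wav2vec2 hidden states (roughly 50 feature frames per second) to its 30 fps internal video rate before bucketing. A minimal standalone sketch of that resampling step, with assumed sizes (the 2 s clip and 512-dim features are illustrative, not taken from the file):

import torch
import torch.nn.functional as F

wav2vec_feats = torch.randn(1, 100, 512)   # ~2 s of audio at ~50 feature frames/s (assumed)
input_fps, output_fps = 50, 30             # wav2vec2 rate -> self.video_rate

x = wav2vec_feats.transpose(1, 2)          # [1, 512, T]
output_len = int(x.shape[2] / input_fps * output_fps)
x = F.interpolate(x, size=output_len, mode='linear', align_corners=True)
print(x.transpose(1, 2).shape)             # torch.Size([1, 60, 512])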
videox_fun/models/wan_audio_injector.py DELETED
@@ -1,1093 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/s2v/motioner.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import importlib.metadata
4
- import math
5
- from typing import Any, Dict, List, Literal, Optional, Tuple, Union
6
-
7
- import numpy as np
8
- import torch
9
- import torch.cuda.amp as amp
10
- import torch.nn as nn
11
- import torch.nn.functional as F
12
- from diffusers.configuration_utils import ConfigMixin, register_to_config
13
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
14
- from diffusers.models import ModelMixin
15
- from diffusers.models.attention import AdaLayerNorm
16
- from diffusers.utils import BaseOutput, is_torch_version, logging
17
- from einops import rearrange, repeat
18
-
19
- from .attention_utils import attention
20
- from .wan_transformer3d import WanAttentionBlock, WanCrossAttention
21
-
22
-
23
- def rope_precompute(x, grid_sizes, freqs, start=None):
24
- b, s, n, c = x.size(0), x.size(1), x.size(2), x.size(3) // 2
25
-
26
- # split freqs
27
- if type(freqs) is list:
28
- trainable_freqs = freqs[1]
29
- freqs = freqs[0]
30
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
31
-
32
- # loop over samples
33
- output = torch.view_as_complex(x.detach().reshape(b, s, n, -1,
34
- 2).to(torch.float64))
35
- seq_bucket = [0]
36
- if not type(grid_sizes) is list:
37
- grid_sizes = [grid_sizes]
38
- for g in grid_sizes:
39
- if not type(g) is list:
40
- g = [torch.zeros_like(g), g]
41
- batch_size = g[0].shape[0]
42
- for i in range(batch_size):
43
- if start is None:
44
- f_o, h_o, w_o = g[0][i]
45
- else:
46
- f_o, h_o, w_o = start[i]
47
-
48
- f, h, w = g[1][i]
49
- t_f, t_h, t_w = g[2][i]
50
- seq_f, seq_h, seq_w = f - f_o, h - h_o, w - w_o
51
- seq_len = int(seq_f * seq_h * seq_w)
52
- if seq_len > 0:
53
- if t_f > 0:
54
- factor_f, factor_h, factor_w = (t_f / seq_f).item(), (
55
- t_h / seq_h).item(), (t_w / seq_w).item()
56
- # Generate a list of seq_f integers starting from f_o and ending at math.ceil(factor_f * seq_f.item() + f_o.item())
57
- if f_o >= 0:
58
- f_sam = np.linspace(f_o.item(), (t_f + f_o).item() - 1,
59
- seq_f).astype(int).tolist()
60
- else:
61
- f_sam = np.linspace(-f_o.item(),
62
- (-t_f - f_o).item() + 1,
63
- seq_f).astype(int).tolist()
64
- h_sam = np.linspace(h_o.item(), (t_h + h_o).item() - 1,
65
- seq_h).astype(int).tolist()
66
- w_sam = np.linspace(w_o.item(), (t_w + w_o).item() - 1,
67
- seq_w).astype(int).tolist()
68
-
69
- assert f_o * f >= 0 and h_o * h >= 0 and w_o * w >= 0
70
- freqs_0 = freqs[0][f_sam] if f_o >= 0 else freqs[0][
71
- f_sam].conj()
72
- freqs_0 = freqs_0.view(seq_f, 1, 1, -1)
73
-
74
- freqs_i = torch.cat([
75
- freqs_0.expand(seq_f, seq_h, seq_w, -1),
76
- freqs[1][h_sam].view(1, seq_h, 1, -1).expand(
77
- seq_f, seq_h, seq_w, -1),
78
- freqs[2][w_sam].view(1, 1, seq_w, -1).expand(
79
- seq_f, seq_h, seq_w, -1),
80
- ],
81
- dim=-1).reshape(seq_len, 1, -1)
82
- elif t_f < 0:
83
- freqs_i = trainable_freqs.unsqueeze(1)
84
- # apply rotary embedding
85
- output[i, seq_bucket[-1]:seq_bucket[-1] + seq_len] = freqs_i
86
- seq_bucket.append(seq_bucket[-1] + seq_len)
87
- return output
88
-
89
-
90
- def sinusoidal_embedding_1d(dim, position):
91
- # preprocess
92
- assert dim % 2 == 0
93
- half = dim // 2
94
- position = position.type(torch.float64)
95
-
96
- # calculation
97
- sinusoid = torch.outer(
98
- position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
99
- x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
100
- return x
101
-
102
-
103
- @amp.autocast(enabled=False)
104
- def rope_params(max_seq_len, dim, theta=10000):
105
- assert dim % 2 == 0
106
- freqs = torch.outer(
107
- torch.arange(max_seq_len),
108
- 1.0 / torch.pow(theta,
109
- torch.arange(0, dim, 2).to(torch.float64).div(dim)))
110
- freqs = torch.polar(torch.ones_like(freqs), freqs)
111
- return freqs
112
-
113
-
114
- @amp.autocast(enabled=False)
115
- def rope_apply(x, grid_sizes, freqs, start=None):
116
- n, c = x.size(2), x.size(3) // 2
117
-
118
- # split freqs
119
- if type(freqs) is list:
120
- trainable_freqs = freqs[1]
121
- freqs = freqs[0]
122
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
123
-
124
- # loop over samples
125
- output = []
126
- output = x.clone()
127
- seq_bucket = [0]
128
- if not type(grid_sizes) is list:
129
- grid_sizes = [grid_sizes]
130
- for g in grid_sizes:
131
- if not type(g) is list:
132
- g = [torch.zeros_like(g), g]
133
- batch_size = g[0].shape[0]
134
- for i in range(batch_size):
135
- if start is None:
136
- f_o, h_o, w_o = g[0][i]
137
- else:
138
- f_o, h_o, w_o = start[i]
139
-
140
- f, h, w = g[1][i]
141
- t_f, t_h, t_w = g[2][i]
142
- seq_f, seq_h, seq_w = f - f_o, h - h_o, w - w_o
143
- seq_len = int(seq_f * seq_h * seq_w)
144
- if seq_len > 0:
145
- if t_f > 0:
146
- factor_f, factor_h, factor_w = (t_f / seq_f).item(), (
147
- t_h / seq_h).item(), (t_w / seq_w).item()
148
-
149
- if f_o >= 0:
150
- f_sam = np.linspace(f_o.item(), (t_f + f_o).item() - 1,
151
- seq_f).astype(int).tolist()
152
- else:
153
- f_sam = np.linspace(-f_o.item(),
154
- (-t_f - f_o).item() + 1,
155
- seq_f).astype(int).tolist()
156
- h_sam = np.linspace(h_o.item(), (t_h + h_o).item() - 1,
157
- seq_h).astype(int).tolist()
158
- w_sam = np.linspace(w_o.item(), (t_w + w_o).item() - 1,
159
- seq_w).astype(int).tolist()
160
-
161
- assert f_o * f >= 0 and h_o * h >= 0 and w_o * w >= 0
162
- freqs_0 = freqs[0][f_sam] if f_o >= 0 else freqs[0][
163
- f_sam].conj()
164
- freqs_0 = freqs_0.view(seq_f, 1, 1, -1)
165
-
166
- freqs_i = torch.cat([
167
- freqs_0.expand(seq_f, seq_h, seq_w, -1),
168
- freqs[1][h_sam].view(1, seq_h, 1, -1).expand(
169
- seq_f, seq_h, seq_w, -1),
170
- freqs[2][w_sam].view(1, 1, seq_w, -1).expand(
171
- seq_f, seq_h, seq_w, -1),
172
- ],
173
- dim=-1).reshape(seq_len, 1, -1)
174
- elif t_f < 0:
175
- freqs_i = trainable_freqs.unsqueeze(1)
176
- # apply rotary embedding
177
- # precompute multipliers
178
- x_i = torch.view_as_complex(
179
- x[i, seq_bucket[-1]:seq_bucket[-1] + seq_len].to(
180
- torch.float64).reshape(seq_len, n, -1, 2))
181
- x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
182
- output[i, seq_bucket[-1]:seq_bucket[-1] + seq_len] = x_i
183
- seq_bucket.append(seq_bucket[-1] + seq_len)
184
- return output.float()
185
-
186
-
187
-
188
- class CausalConv1d(nn.Module):
189
-
190
- def __init__(self,
191
- chan_in,
192
- chan_out,
193
- kernel_size=3,
194
- stride=1,
195
- dilation=1,
196
- pad_mode='replicate',
197
- **kwargs):
198
- super().__init__()
199
-
200
- self.pad_mode = pad_mode
201
- padding = (kernel_size - 1, 0) # T
202
- self.time_causal_padding = padding
203
-
204
- self.conv = nn.Conv1d(
205
- chan_in,
206
- chan_out,
207
- kernel_size,
208
- stride=stride,
209
- dilation=dilation,
210
- **kwargs)
211
-
212
- def forward(self, x):
213
- x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
214
- return self.conv(x)
215
-
216
-
217
- class MotionEncoder_tc(nn.Module):
218
-
219
- def __init__(self,
220
- in_dim: int,
221
- hidden_dim: int,
222
- num_heads=int,
223
- need_global=True,
224
- dtype=None,
225
- device=None):
226
- factory_kwargs = {"dtype": dtype, "device": device}
227
- super().__init__()
228
-
229
- self.num_heads = num_heads
230
- self.need_global = need_global
231
- self.conv1_local = CausalConv1d(
232
- in_dim, hidden_dim // 4 * num_heads, 3, stride=1)
233
- if need_global:
234
- self.conv1_global = CausalConv1d(
235
- in_dim, hidden_dim // 4, 3, stride=1)
236
- self.norm1 = nn.LayerNorm(
237
- hidden_dim // 4,
238
- elementwise_affine=False,
239
- eps=1e-6,
240
- **factory_kwargs)
241
- self.act = nn.SiLU()
242
- self.conv2 = CausalConv1d(hidden_dim // 4, hidden_dim // 2, 3, stride=2)
243
- self.conv3 = CausalConv1d(hidden_dim // 2, hidden_dim, 3, stride=2)
244
-
245
- if need_global:
246
- self.final_linear = nn.Linear(hidden_dim, hidden_dim,
247
- **factory_kwargs)
248
-
249
- self.norm1 = nn.LayerNorm(
250
- hidden_dim // 4,
251
- elementwise_affine=False,
252
- eps=1e-6,
253
- **factory_kwargs)
254
-
255
- self.norm2 = nn.LayerNorm(
256
- hidden_dim // 2,
257
- elementwise_affine=False,
258
- eps=1e-6,
259
- **factory_kwargs)
260
-
261
- self.norm3 = nn.LayerNorm(
262
- hidden_dim, elementwise_affine=False, eps=1e-6, **factory_kwargs)
263
-
264
- self.padding_tokens = nn.Parameter(torch.zeros(1, 1, 1, hidden_dim))
265
-
266
- def forward(self, x):
267
- x = rearrange(x, 'b t c -> b c t')
268
- x_ori = x.clone()
269
- b, c, t = x.shape
270
- x = self.conv1_local(x)
271
- x = rearrange(x, 'b (n c) t -> (b n) t c', n=self.num_heads)
272
- x = self.norm1(x)
273
- x = self.act(x)
274
- x = rearrange(x, 'b t c -> b c t')
275
- x = self.conv2(x)
276
- x = rearrange(x, 'b c t -> b t c')
277
- x = self.norm2(x)
278
- x = self.act(x)
279
- x = rearrange(x, 'b t c -> b c t')
280
- x = self.conv3(x)
281
- x = rearrange(x, 'b c t -> b t c')
282
- x = self.norm3(x)
283
- x = self.act(x)
284
- x = rearrange(x, '(b n) t c -> b t n c', b=b)
285
- padding = self.padding_tokens.repeat(b, x.shape[1], 1, 1)
286
- x = torch.cat([x, padding], dim=-2)
287
- x_local = x.clone()
288
-
289
- if not self.need_global:
290
- return x_local
291
-
292
- x = self.conv1_global(x_ori)
293
- x = rearrange(x, 'b c t -> b t c')
294
- x = self.norm1(x)
295
- x = self.act(x)
296
- x = rearrange(x, 'b t c -> b c t')
297
- x = self.conv2(x)
298
- x = rearrange(x, 'b c t -> b t c')
299
- x = self.norm2(x)
300
- x = self.act(x)
301
- x = rearrange(x, 'b t c -> b c t')
302
- x = self.conv3(x)
303
- x = rearrange(x, 'b c t -> b t c')
304
- x = self.norm3(x)
305
- x = self.act(x)
306
- x = self.final_linear(x)
307
- x = rearrange(x, '(b n) t c -> b t n c', b=b)
308
-
309
- return x, x_local
310
-
311
-
312
- class CausalAudioEncoder(nn.Module):
313
-
314
- def __init__(self,
315
- dim=5120,
316
- num_layers=25,
317
- out_dim=2048,
318
- video_rate=8,
319
- num_token=4,
320
- need_global=False):
321
- super().__init__()
322
- self.encoder = MotionEncoder_tc(
323
- in_dim=dim,
324
- hidden_dim=out_dim,
325
- num_heads=num_token,
326
- need_global=need_global)
327
- weight = torch.ones((1, num_layers, 1, 1)) * 0.01
328
-
329
- self.weights = torch.nn.Parameter(weight)
330
- self.act = torch.nn.SiLU()
331
-
332
- def forward(self, features):
333
- with amp.autocast(dtype=torch.float32):
334
- # features B * num_layers * dim * video_length
335
- weights = self.act(self.weights)
336
- weights_sum = weights.sum(dim=1, keepdims=True)
337
- weighted_feat = ((features * weights) / weights_sum).sum(
338
- dim=1) # b dim f
339
- weighted_feat = weighted_feat.permute(0, 2, 1) # b f dim
340
- res = self.encoder(weighted_feat) # b f n dim
341
-
342
- return res # b f n dim
343
-
344
-
345
- class AudioCrossAttention(WanCrossAttention):
346
-
347
- def __init__(self, *args, **kwargs):
348
- super().__init__(*args, **kwargs)
349
-
350
- def forward(self, x, context, context_lens, dtype=torch.bfloat16, t=0):
351
- r"""
352
- Args:
353
- x(Tensor): Shape [B, L1, C]
354
- context(Tensor): Shape [B, L2, C]
355
- context_lens(Tensor): Shape [B]
356
- """
357
- b, n, d = x.size(0), self.num_heads, self.head_dim
358
- # compute query, key, value
359
- q = self.norm_q(self.q(x.to(dtype))).view(b, -1, n, d)
360
- k = self.norm_k(self.k(context.to(dtype))).view(b, -1, n, d)
361
- v = self.v(context.to(dtype)).view(b, -1, n, d)
362
- # compute attention
363
- x = attention(q.to(dtype), k.to(dtype), v.to(dtype), k_lens=context_lens, attention_type="FLASH_ATTENTION")
364
- # output
365
- x = x.flatten(2)
366
- x = self.o(x.to(dtype))
367
- return x
368
-
369
-
370
- class AudioInjector_WAN(nn.Module):
371
-
372
- def __init__(self,
373
- all_modules,
374
- all_modules_names,
375
- dim=2048,
376
- num_heads=32,
377
- inject_layer=[0, 27],
378
- root_net=None,
379
- enable_adain=False,
380
- adain_dim=2048,
381
- need_adain_ont=False):
382
- super().__init__()
383
- num_injector_layers = len(inject_layer)
384
- self.injected_block_id = {}
385
- audio_injector_id = 0
386
- for mod_name, mod in zip(all_modules_names, all_modules):
387
- if isinstance(mod, WanAttentionBlock):
388
- for inject_id in inject_layer:
389
- if f'transformer_blocks.{inject_id}' in mod_name:
390
- self.injected_block_id[inject_id] = audio_injector_id
391
- audio_injector_id += 1
392
-
393
- self.injector = nn.ModuleList([
394
- AudioCrossAttention(
395
- dim=dim,
396
- num_heads=num_heads,
397
- qk_norm=True,
398
- ) for _ in range(audio_injector_id)
399
- ])
400
- self.injector_pre_norm_feat = nn.ModuleList([
401
- nn.LayerNorm(
402
- dim,
403
- elementwise_affine=False,
404
- eps=1e-6,
405
- ) for _ in range(audio_injector_id)
406
- ])
407
- self.injector_pre_norm_vec = nn.ModuleList([
408
- nn.LayerNorm(
409
- dim,
410
- elementwise_affine=False,
411
- eps=1e-6,
412
- ) for _ in range(audio_injector_id)
413
- ])
414
- if enable_adain:
415
- self.injector_adain_layers = nn.ModuleList([
416
- AdaLayerNorm(
417
- output_dim=dim * 2, embedding_dim=adain_dim, chunk_dim=1)
418
- for _ in range(audio_injector_id)
419
- ])
420
- if need_adain_ont:
421
- self.injector_adain_output_layers = nn.ModuleList(
422
- [nn.Linear(dim, dim) for _ in range(audio_injector_id)])
423
-
424
-
425
- class RMSNorm(nn.Module):
426
-
427
- def __init__(self, dim, eps=1e-5):
428
- super().__init__()
429
- self.dim = dim
430
- self.eps = eps
431
- self.weight = nn.Parameter(torch.ones(dim))
432
-
433
- def forward(self, x):
434
- return self._norm(x.float()).type_as(x) * self.weight
435
-
436
- def _norm(self, x):
437
- return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
438
-
439
-
440
- class LayerNorm(nn.LayerNorm):
441
-
442
- def __init__(self, dim, eps=1e-6, elementwise_affine=False):
443
- super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
444
-
445
- def forward(self, x):
446
- return super().forward(x.float()).type_as(x)
447
-
448
-
449
- class SelfAttention(nn.Module):
450
-
451
- def __init__(self,
452
- dim,
453
- num_heads,
454
- window_size=(-1, -1),
455
- qk_norm=True,
456
- eps=1e-6):
457
- assert dim % num_heads == 0
458
- super().__init__()
459
- self.dim = dim
460
- self.num_heads = num_heads
461
- self.head_dim = dim // num_heads
462
- self.window_size = window_size
463
- self.qk_norm = qk_norm
464
- self.eps = eps
465
-
466
- # layers
467
- self.q = nn.Linear(dim, dim)
468
- self.k = nn.Linear(dim, dim)
469
- self.v = nn.Linear(dim, dim)
470
- self.o = nn.Linear(dim, dim)
471
- self.norm_q = RMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
472
- self.norm_k = RMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
473
-
474
- def forward(self, x, seq_lens, grid_sizes, freqs):
475
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
476
-
477
- # query, key, value function
478
- def qkv_fn(x):
479
- q = self.norm_q(self.q(x)).view(b, s, n, d)
480
- k = self.norm_k(self.k(x)).view(b, s, n, d)
481
- v = self.v(x).view(b, s, n, d)
482
- return q, k, v
483
-
484
- q, k, v = qkv_fn(x)
485
-
486
- x = attention(
487
- q=rope_apply(q, grid_sizes, freqs),
488
- k=rope_apply(k, grid_sizes, freqs),
489
- v=v,
490
- k_lens=seq_lens,
491
- window_size=self.window_size)
492
-
493
- # output
494
- x = x.flatten(2)
495
- x = self.o(x)
496
- return x
497
-
498
-
499
- class SwinSelfAttention(SelfAttention):
500
-
501
- def forward(self, x, seq_lens, grid_sizes, freqs):
502
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
503
- assert b == 1, 'Only support batch_size 1'
504
-
505
- # query, key, value function
506
- def qkv_fn(x):
507
- q = self.norm_q(self.q(x)).view(b, s, n, d)
508
- k = self.norm_k(self.k(x)).view(b, s, n, d)
509
- v = self.v(x).view(b, s, n, d)
510
- return q, k, v
511
-
512
- q, k, v = qkv_fn(x)
513
-
514
- q = rope_apply(q, grid_sizes, freqs)
515
- k = rope_apply(k, grid_sizes, freqs)
516
- T, H, W = grid_sizes[0].tolist()
517
-
518
- q = rearrange(q, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
519
- k = rearrange(k, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
520
- v = rearrange(v, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
521
-
522
- ref_q = q[-1:]
523
- q = q[:-1]
524
-
525
- ref_k = repeat(
526
- k[-1:], "1 s n d -> t s n d", t=k.shape[0] - 1) # t hw n d
527
- k = k[:-1]
528
- k = torch.cat([k[:1], k, k[-1:]])
529
- k = torch.cat([k[1:-1], k[2:], k[:-2], ref_k], dim=1) # (bt) (3hw) n d
530
-
531
- ref_v = repeat(v[-1:], "1 s n d -> t s n d", t=v.shape[0] - 1)
532
- v = v[:-1]
533
- v = torch.cat([v[:1], v, v[-1:]])
534
- v = torch.cat([v[1:-1], v[2:], v[:-2], ref_v], dim=1)
535
-
536
- # q: b (t h w) n d
537
- # k: b (t h w) n d
538
- out = attention(
539
- q=q,
540
- k=k,
541
- v=v,
542
- # k_lens=torch.tensor([k.shape[1]] * k.shape[0], device=x.device, dtype=torch.long),
543
- window_size=self.window_size)
544
- out = torch.cat([out, ref_v[:1]], axis=0)
545
- out = rearrange(out, '(b t) (h w) n d -> b (t h w) n d', t=T, h=H, w=W)
546
- x = out
547
-
548
- # output
549
- x = x.flatten(2)
550
- x = self.o(x)
551
- return x
552
-
553
-
554
- #Fix the reference frame RoPE to 1,H,W.
555
- #Set the current frame RoPE to 1.
556
- #Set the previous frame RoPE to 0.
557
- class CasualSelfAttention(SelfAttention):
558
-
559
- def forward(self, x, seq_lens, grid_sizes, freqs):
560
- shifting = 3
561
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
562
- assert b == 1, 'Only support batch_size 1'
563
-
564
- # query, key, value function
565
- def qkv_fn(x):
566
- q = self.norm_q(self.q(x)).view(b, s, n, d)
567
- k = self.norm_k(self.k(x)).view(b, s, n, d)
568
- v = self.v(x).view(b, s, n, d)
569
- return q, k, v
570
-
571
- q, k, v = qkv_fn(x)
572
-
573
- T, H, W = grid_sizes[0].tolist()
574
-
575
- q = rearrange(q, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
576
- k = rearrange(k, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
577
- v = rearrange(v, 'b (t h w) n d -> (b t) (h w) n d', t=T, h=H, w=W)
578
-
579
- ref_q = q[-1:]
580
- q = q[:-1]
581
-
582
- grid_sizes = torch.tensor([[1, H, W]] * q.shape[0], dtype=torch.long)
583
- start = [[shifting, 0, 0]] * q.shape[0]
584
- q = rope_apply(q, grid_sizes, freqs, start=start)
585
-
586
- ref_k = k[-1:]
587
- grid_sizes = torch.tensor([[1, H, W]], dtype=torch.long)
588
- # start = [[shifting, H, W]]
589
-
590
- start = [[shifting + 10, 0, 0]]
591
- ref_k = rope_apply(ref_k, grid_sizes, freqs, start)
592
- ref_k = repeat(
593
- ref_k, "1 s n d -> t s n d", t=k.shape[0] - 1) # t hw n d
594
-
595
- k = k[:-1]
596
- k = torch.cat([*([k[:1]] * shifting), k])
597
- cat_k = []
598
- for i in range(shifting):
599
- cat_k.append(k[i:i - shifting])
600
- cat_k.append(k[shifting:])
601
- k = torch.cat(cat_k, dim=1) # (bt) (3hw) n d
602
-
603
- grid_sizes = torch.tensor(
604
- [[shifting + 1, H, W]] * q.shape[0], dtype=torch.long)
605
- k = rope_apply(k, grid_sizes, freqs)
606
- k = torch.cat([k, ref_k], dim=1)
607
-
608
- ref_v = repeat(v[-1:], "1 s n d -> t s n d", t=q.shape[0]) # t hw n d
609
- v = v[:-1]
610
- v = torch.cat([*([v[:1]] * shifting), v])
611
- cat_v = []
612
- for i in range(shifting):
613
- cat_v.append(v[i:i - shifting])
614
- cat_v.append(v[shifting:])
615
- v = torch.cat(cat_v, dim=1) # (bt) (3hw) n d
616
- v = torch.cat([v, ref_v], dim=1)
617
-
618
- # q: b (t h w) n d
619
- # k: b (t h w) n d
620
- outs = []
621
- for i in range(q.shape[0]):
622
- out = attention(
623
- q=q[i:i + 1],
624
- k=k[i:i + 1],
625
- v=v[i:i + 1],
626
- window_size=self.window_size)
627
- outs.append(out)
628
- out = torch.cat(outs, dim=0)
629
- out = torch.cat([out, ref_v[:1]], axis=0)
630
- out = rearrange(out, '(b t) (h w) n d -> b (t h w) n d', t=T, h=H, w=W)
631
- x = out
632
-
633
- # output
634
- x = x.flatten(2)
635
- x = self.o(x)
636
- return x
637
-
638
-
639
- class MotionerAttentionBlock(nn.Module):
640
-
641
- def __init__(self,
642
- dim,
643
- ffn_dim,
644
- num_heads,
645
- window_size=(-1, -1),
646
- qk_norm=True,
647
- cross_attn_norm=False,
648
- eps=1e-6,
649
- self_attn_block="SelfAttention"):
650
- super().__init__()
651
- self.dim = dim
652
- self.ffn_dim = ffn_dim
653
- self.num_heads = num_heads
654
- self.window_size = window_size
655
- self.qk_norm = qk_norm
656
- self.cross_attn_norm = cross_attn_norm
657
- self.eps = eps
658
-
659
- # layers
660
- self.norm1 = LayerNorm(dim, eps)
661
- if self_attn_block == "SelfAttention":
662
- self.self_attn = SelfAttention(dim, num_heads, window_size, qk_norm,
663
- eps)
664
- elif self_attn_block == "SwinSelfAttention":
665
- self.self_attn = SwinSelfAttention(dim, num_heads, window_size,
666
- qk_norm, eps)
667
- elif self_attn_block == "CasualSelfAttention":
668
- self.self_attn = CasualSelfAttention(dim, num_heads, window_size,
669
- qk_norm, eps)
670
-
671
- self.norm2 = LayerNorm(dim, eps)
672
- self.ffn = nn.Sequential(
673
- nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
674
- nn.Linear(ffn_dim, dim))
675
-
676
- def forward(
677
- self,
678
- x,
679
- seq_lens,
680
- grid_sizes,
681
- freqs,
682
- ):
683
- # self-attention
684
- y = self.self_attn(self.norm1(x).float(), seq_lens, grid_sizes, freqs)
685
- x = x + y
686
- y = self.ffn(self.norm2(x).float())
687
- x = x + y
688
- return x
689
-
690
-
691
- class Head(nn.Module):
692
-
693
- def __init__(self, dim, out_dim, patch_size, eps=1e-6):
694
- super().__init__()
695
- self.dim = dim
696
- self.out_dim = out_dim
697
- self.patch_size = patch_size
698
- self.eps = eps
699
-
700
- # layers
701
- out_dim = math.prod(patch_size) * out_dim
702
- self.norm = LayerNorm(dim, eps)
703
- self.head = nn.Linear(dim, out_dim)
704
-
705
- def forward(self, x):
706
- x = self.head(self.norm(x))
707
- return x
708
-
709
-
710
- class MotionerTransformers(nn.Module, PeftAdapterMixin):
711
-
712
- def __init__(
713
- self,
714
- patch_size=(1, 2, 2),
715
- in_dim=16,
716
- dim=2048,
717
- ffn_dim=8192,
718
- freq_dim=256,
719
- out_dim=16,
720
- num_heads=16,
721
- num_layers=32,
722
- window_size=(-1, -1),
723
- qk_norm=True,
724
- cross_attn_norm=False,
725
- eps=1e-6,
726
- self_attn_block="SelfAttention",
727
- motion_token_num=1024,
728
- enable_tsm=False,
729
- motion_stride=4,
730
- expand_ratio=2,
731
- trainable_token_pos_emb=False,
732
- ):
733
- super().__init__()
734
- self.patch_size = patch_size
735
- self.in_dim = in_dim
736
- self.dim = dim
737
- self.ffn_dim = ffn_dim
738
- self.freq_dim = freq_dim
739
- self.out_dim = out_dim
740
- self.num_heads = num_heads
741
- self.num_layers = num_layers
742
- self.window_size = window_size
743
- self.qk_norm = qk_norm
744
- self.cross_attn_norm = cross_attn_norm
745
- self.eps = eps
746
-
747
- self.enable_tsm = enable_tsm
748
- self.motion_stride = motion_stride
749
- self.expand_ratio = expand_ratio
750
- self.sample_c = self.patch_size[0]
751
-
752
- # embeddings
753
- self.patch_embedding = nn.Conv3d(
754
- in_dim, dim, kernel_size=patch_size, stride=patch_size)
755
-
756
- # blocks
757
- self.blocks = nn.ModuleList([
758
- MotionerAttentionBlock(
759
- dim,
760
- ffn_dim,
761
- num_heads,
762
- window_size,
763
- qk_norm,
764
- cross_attn_norm,
765
- eps,
766
- self_attn_block=self_attn_block) for _ in range(num_layers)
767
- ])
768
-
769
- # buffers (don't use register_buffer otherwise dtype will be changed in to())
770
- assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
771
- d = dim // num_heads
772
- self.freqs = torch.cat([
773
- rope_params(1024, d - 4 * (d // 6)),
774
- rope_params(1024, 2 * (d // 6)),
775
- rope_params(1024, 2 * (d // 6))
776
- ],
777
- dim=1)
778
-
779
- self.gradient_checkpointing = False
780
-
781
- self.motion_side_len = int(math.sqrt(motion_token_num))
782
- assert self.motion_side_len**2 == motion_token_num
783
- self.token = nn.Parameter(
784
- torch.zeros(1, motion_token_num, dim).contiguous())
785
-
786
- self.trainable_token_pos_emb = trainable_token_pos_emb
787
- if trainable_token_pos_emb:
788
- x = torch.zeros([1, motion_token_num, num_heads, d])
789
- x[..., ::2] = 1
790
-
791
- gride_sizes = [[
792
- torch.tensor([0, 0, 0]).unsqueeze(0).repeat(1, 1),
793
- torch.tensor([1, self.motion_side_len,
794
- self.motion_side_len]).unsqueeze(0).repeat(1, 1),
795
- torch.tensor([1, self.motion_side_len,
796
- self.motion_side_len]).unsqueeze(0).repeat(1, 1),
797
- ]]
798
- token_freqs = rope_apply(x, gride_sizes, self.freqs)
799
- token_freqs = token_freqs[0, :, 0].reshape(motion_token_num, -1, 2)
800
- token_freqs = token_freqs * 0.01
801
- self.token_freqs = torch.nn.Parameter(token_freqs)
802
-
803
- def after_patch_embedding(self, x):
804
- return x
805
-
806
- def forward(
807
- self,
808
- x,
809
- ):
810
- """
811
- x: A list of videos each with shape [C, T, H, W].
812
- t: [B].
813
- context: A list of text embeddings each with shape [L, C].
814
- """
815
- # params
816
- motion_frames = x[0].shape[1]
817
- device = self.patch_embedding.weight.device
818
- freqs = self.freqs
819
- if freqs.device != device:
820
- freqs = freqs.to(device)
821
-
822
- if self.trainable_token_pos_emb:
823
- with amp.autocast(dtype=torch.float64):
824
- token_freqs = self.token_freqs.to(torch.float64)
825
- token_freqs = token_freqs / token_freqs.norm(
826
- dim=-1, keepdim=True)
827
- freqs = [freqs, torch.view_as_complex(token_freqs)]
828
-
829
- if self.enable_tsm:
830
- sample_idx = [
831
- sample_indices(
832
- u.shape[1],
833
- stride=self.motion_stride,
834
- expand_ratio=self.expand_ratio,
835
- c=self.sample_c) for u in x
836
- ]
837
- x = [
838
- torch.flip(torch.flip(u, [1])[:, idx], [1])
839
- for idx, u in zip(sample_idx, x)
840
- ]
841
-
842
- # embeddings
843
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
844
- x = self.after_patch_embedding(x)
845
-
846
- seq_f, seq_h, seq_w = x[0].shape[-3:]
847
- batch_size = len(x)
848
- if not self.enable_tsm:
849
- grid_sizes = torch.stack(
850
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
851
- grid_sizes = [[
852
- torch.zeros_like(grid_sizes), grid_sizes, grid_sizes
853
- ]]
854
- seq_f = 0
855
- else:
856
- grid_sizes = []
857
- for idx in sample_idx[0][::-1][::self.sample_c]:
858
- tsm_frame_grid_sizes = [[
859
- torch.tensor([idx, 0,
860
- 0]).unsqueeze(0).repeat(batch_size, 1),
861
- torch.tensor([idx + 1, seq_h,
862
- seq_w]).unsqueeze(0).repeat(batch_size, 1),
863
- torch.tensor([1, seq_h,
864
- seq_w]).unsqueeze(0).repeat(batch_size, 1),
865
- ]]
866
- grid_sizes += tsm_frame_grid_sizes
867
- seq_f = sample_idx[0][-1] + 1
868
-
869
- x = [u.flatten(2).transpose(1, 2) for u in x]
870
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
871
- x = torch.cat([u for u in x])
872
-
873
- batch_size = len(x)
874
-
875
- token_grid_sizes = [[
876
- torch.tensor([seq_f, 0, 0]).unsqueeze(0).repeat(batch_size, 1),
877
- torch.tensor(
878
- [seq_f + 1, self.motion_side_len,
879
- self.motion_side_len]).unsqueeze(0).repeat(batch_size, 1),
880
- torch.tensor(
881
- [1 if not self.trainable_token_pos_emb else -1, seq_h,
882
- seq_w]).unsqueeze(0).repeat(batch_size, 1),
883
- ] # the third row gives the range that the rope emb is meant to cover
884
- ]
885
-
886
- grid_sizes = grid_sizes + token_grid_sizes
887
- token_unpatch_grid_sizes = torch.stack([
888
- torch.tensor([1, 32, 32], dtype=torch.long)
889
- for b in range(batch_size)
890
- ])
891
- token_len = self.token.shape[1]
892
- token = self.token.clone().repeat(x.shape[0], 1, 1).contiguous()
893
- seq_lens = seq_lens + torch.tensor([t.size(0) for t in token],
894
- dtype=torch.long)
895
- x = torch.cat([x, token], dim=1)
896
- # arguments
897
- kwargs = dict(
898
- seq_lens=seq_lens,
899
- grid_sizes=grid_sizes,
900
- freqs=freqs,
901
- )
902
-
903
- # grad ckpt args
904
- def create_custom_forward(module, return_dict=None):
905
-
906
- def custom_forward(*inputs, **kwargs):
907
- if return_dict is not None:
908
- return module(*inputs, **kwargs, return_dict=return_dict)
909
- else:
910
- return module(*inputs, **kwargs)
911
-
912
- return custom_forward
913
-
914
- ckpt_kwargs: Dict[str, Any] = ({
915
- "use_reentrant": False
916
- } if is_torch_version(">=", "1.11.0") else {})
917
-
918
- for idx, block in enumerate(self.blocks):
919
- if self.training and self.gradient_checkpointing:
920
- x = torch.utils.checkpoint.checkpoint(
921
- create_custom_forward(block),
922
- x,
923
- **kwargs,
924
- **ckpt_kwargs,
925
- )
926
- else:
927
- x = block(x, **kwargs)
928
- # head
929
- out = x[:, -token_len:]
930
- return out
931
-
932
- def unpatchify(self, x, grid_sizes):
933
- c = self.out_dim
934
- out = []
935
- for u, v in zip(x, grid_sizes.tolist()):
936
- u = u[:math.prod(v)].view(*v, *self.patch_size, c)
937
- u = torch.einsum('fhwpqrc->cfphqwr', u)
938
- u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
939
- out.append(u)
940
- return out
941
-
942
- def init_weights(self):
943
- # basic init
944
- for m in self.modules():
945
- if isinstance(m, nn.Linear):
946
- nn.init.xavier_uniform_(m.weight)
947
- if m.bias is not None:
948
- nn.init.zeros_(m.bias)
949
-
950
- # init embeddings
951
- nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
952
-
953
-
954
- class FramePackMotioner(nn.Module):
955
-
956
- def __init__(
957
- self,
958
- inner_dim=1024,
959
- num_heads=16, # Used to indicate the number of heads in the backbone network; unrelated to this module's design
960
- zip_frame_buckets=[
961
- 1, 2, 16
962
- ], # Three numbers representing the number of frames sampled for patch operations from the nearest to the farthest frames
963
- drop_mode="drop", # If not "drop", it will use "padd", meaning padding instead of deletion
964
- *args,
965
- **kwargs):
966
- super().__init__(*args, **kwargs)
967
- self.proj = nn.Conv3d(
968
- 16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
969
- self.proj_2x = nn.Conv3d(
970
- 16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
971
- self.proj_4x = nn.Conv3d(
972
- 16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
973
- self.zip_frame_buckets = torch.tensor(
974
- zip_frame_buckets, dtype=torch.long)
975
-
976
- self.inner_dim = inner_dim
977
- self.num_heads = num_heads
978
-
979
- assert (inner_dim %
980
- num_heads) == 0 and (inner_dim // num_heads) % 2 == 0
981
- d = inner_dim // num_heads
982
- self.freqs = torch.cat([
983
- rope_params(1024, d - 4 * (d // 6)),
984
- rope_params(1024, 2 * (d // 6)),
985
- rope_params(1024, 2 * (d // 6))
986
- ],
987
- dim=1)
988
- self.drop_mode = drop_mode
989
-
990
- def forward(self, motion_latents, add_last_motion=2):
991
- motion_frames = motion_latents[0].shape[1]
992
- mot = []
993
- mot_remb = []
994
- for m in motion_latents:
995
- lat_height, lat_width = m.shape[2], m.shape[3]
996
- padd_lat = torch.zeros(16, self.zip_frame_buckets.sum(), lat_height,
997
- lat_width).to(
998
- device=m.device, dtype=m.dtype)
999
- overlap_frame = min(padd_lat.shape[1], m.shape[1])
1000
- if overlap_frame > 0:
1001
- padd_lat[:, -overlap_frame:] = m[:, -overlap_frame:]
1002
-
1003
- if add_last_motion < 2 and self.drop_mode != "drop":
1004
- zero_end_frame = self.zip_frame_buckets[:self.zip_frame_buckets.
1005
- __len__() -
1006
- add_last_motion -
1007
- 1].sum()
1008
- padd_lat[:, -zero_end_frame:] = 0
1009
-
1010
- padd_lat = padd_lat.unsqueeze(0)
1011
- clean_latents_4x, clean_latents_2x, clean_latents_post = padd_lat[:, :, -self.zip_frame_buckets.sum(
1012
- ):, :, :].split(
1013
- list(self.zip_frame_buckets)[::-1], dim=2) # 16, 2 ,1
1014
-
1015
- # patchify
1016
- clean_latents_post = self.proj(clean_latents_post).flatten(
1017
- 2).transpose(1, 2)
1018
- clean_latents_2x = self.proj_2x(clean_latents_2x).flatten(
1019
- 2).transpose(1, 2)
1020
- clean_latents_4x = self.proj_4x(clean_latents_4x).flatten(
1021
- 2).transpose(1, 2)
1022
-
1023
- if add_last_motion < 2 and self.drop_mode == "drop":
1024
- clean_latents_post = clean_latents_post[:, :
1025
- 0] if add_last_motion < 2 else clean_latents_post
1026
- clean_latents_2x = clean_latents_2x[:, :
1027
- 0] if add_last_motion < 1 else clean_latents_2x
1028
-
1029
- motion_lat = torch.cat(
1030
- [clean_latents_post, clean_latents_2x, clean_latents_4x], dim=1)
1031
-
1032
- # rope
1033
- start_time_id = -(self.zip_frame_buckets[:1].sum())
1034
- end_time_id = start_time_id + self.zip_frame_buckets[0]
1035
- grid_sizes = [] if add_last_motion < 2 and self.drop_mode == "drop" else \
1036
- [
1037
- [torch.tensor([start_time_id, 0, 0]).unsqueeze(0).repeat(1, 1),
1038
- torch.tensor([end_time_id, lat_height // 2, lat_width // 2]).unsqueeze(0).repeat(1, 1),
1039
- torch.tensor([self.zip_frame_buckets[0], lat_height // 2, lat_width // 2]).unsqueeze(0).repeat(1, 1), ]
1040
- ]
1041
-
1042
- start_time_id = -(self.zip_frame_buckets[:2].sum())
1043
- end_time_id = start_time_id + self.zip_frame_buckets[1] // 2
1044
- grid_sizes_2x = [] if add_last_motion < 1 and self.drop_mode == "drop" else \
1045
- [
1046
- [torch.tensor([start_time_id, 0, 0]).unsqueeze(0).repeat(1, 1),
1047
- torch.tensor([end_time_id, lat_height // 4, lat_width // 4]).unsqueeze(0).repeat(1, 1),
1048
- torch.tensor([self.zip_frame_buckets[1], lat_height // 2, lat_width // 2]).unsqueeze(0).repeat(1, 1), ]
1049
- ]
1050
-
1051
- start_time_id = -(self.zip_frame_buckets[:3].sum())
1052
- end_time_id = start_time_id + self.zip_frame_buckets[2] // 4
1053
- grid_sizes_4x = [[
1054
- torch.tensor([start_time_id, 0, 0]).unsqueeze(0).repeat(1, 1),
1055
- torch.tensor([end_time_id, lat_height // 8,
1056
- lat_width // 8]).unsqueeze(0).repeat(1, 1),
1057
- torch.tensor([
1058
- self.zip_frame_buckets[2], lat_height // 2, lat_width // 2
1059
- ]).unsqueeze(0).repeat(1, 1),
1060
- ]]
1061
-
1062
- grid_sizes = grid_sizes + grid_sizes_2x + grid_sizes_4x
1063
-
1064
- motion_rope_emb = rope_precompute(
1065
- motion_lat.detach().view(1, motion_lat.shape[1], self.num_heads,
1066
- self.inner_dim // self.num_heads),
1067
- grid_sizes,
1068
- self.freqs,
1069
- start=None)
1070
-
1071
- mot.append(motion_lat)
1072
- mot_remb.append(motion_rope_emb)
1073
- return mot, mot_remb
1074
-
1075
-
1076
- def sample_indices(N, stride, expand_ratio, c):
1077
- indices = []
1078
- current_start = 0
1079
-
1080
- while current_start < N:
1081
- bucket_width = int(stride * (expand_ratio**(len(indices) / stride)))
1082
-
1083
- interval = int(bucket_width / stride * c)
1084
- current_end = min(N, current_start + bucket_width)
1085
- bucket_samples = []
1086
- for i in range(current_end - 1, current_start - 1, -interval):
1087
- for near in range(c):
1088
- bucket_samples.append(i - near)
1089
-
1090
- indices += bucket_samples[::-1]
1091
- current_start += bucket_width
1092
-
1093
- return indices
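Note: rope_params and rope_apply above implement rotary position embedding with complex arithmetic: each (even, odd) channel pair of q/k is rotated by a position-dependent angle. A minimal 1D sketch of the same mechanism on a toy tensor (all sizes are illustrative, not taken from the file):

import torch

seq_len, num_heads, head_dim, theta = 8, 2, 16, 10000
x = torch.randn(1, seq_len, num_heads, head_dim)

# per-position complex rotations, as built by rope_params
angles = torch.outer(torch.arange(seq_len).double(),
                     1.0 / theta ** (torch.arange(0, head_dim, 2).double() / head_dim))
freqs = torch.polar(torch.ones_like(angles), angles)          # [seq_len, head_dim // 2]

# rotate channel pairs, as done inside rope_apply
x_c = torch.view_as_complex(x.double().reshape(1, seq_len, num_heads, -1, 2))
x_rot = torch.view_as_real(x_c * freqs.view(1, seq_len, 1, -1)).flatten(3)
print(x_rot.shape)                                            # torch.Size([1, 8, 2, 16])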
videox_fun/models/wan_camera_adapter.py DELETED
@@ -1,64 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
-
4
-
5
- class SimpleAdapter(nn.Module):
6
- def __init__(self, in_dim, out_dim, kernel_size, stride, downscale_factor=8, num_residual_blocks=1):
7
- super(SimpleAdapter, self).__init__()
8
-
9
- # Pixel Unshuffle: reduce spatial dimensions by a factor of 8
10
- self.pixel_unshuffle = nn.PixelUnshuffle(downscale_factor=downscale_factor)
11
-
12
- # Convolution: reduce spatial dimensions by a factor
13
- # of 2 (without overlap)
14
- self.conv = nn.Conv2d(in_dim * downscale_factor * downscale_factor, out_dim, kernel_size=kernel_size, stride=stride, padding=0)
15
-
16
- # Residual blocks for feature extraction
17
- self.residual_blocks = nn.Sequential(
18
- *[ResidualBlock(out_dim) for _ in range(num_residual_blocks)]
19
- )
20
-
21
- def forward(self, x):
22
- # Reshape to merge the frame dimension into batch
23
- bs, c, f, h, w = x.size()
24
- x = x.permute(0, 2, 1, 3, 4).contiguous().view(bs * f, c, h, w)
25
-
26
- # Pixel Unshuffle operation
27
- x_unshuffled = self.pixel_unshuffle(x)
28
-
29
- # Convolution operation
30
- x_conv = self.conv(x_unshuffled)
31
-
32
- # Feature extraction with residual blocks
33
- out = self.residual_blocks(x_conv)
34
-
35
- # Reshape to restore original bf dimension
36
- out = out.view(bs, f, out.size(1), out.size(2), out.size(3))
37
-
38
- # Permute dimensions to reorder (if needed), e.g., swap channels and feature frames
39
- out = out.permute(0, 2, 1, 3, 4)
40
-
41
- return out
42
-
43
-
44
- class ResidualBlock(nn.Module):
45
- def __init__(self, dim):
46
- super(ResidualBlock, self).__init__()
47
- self.conv1 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
48
- self.relu = nn.ReLU(inplace=True)
49
- self.conv2 = nn.Conv2d(dim, dim, kernel_size=3, padding=1)
50
-
51
- def forward(self, x):
52
- residual = x
53
- out = self.relu(self.conv1(x))
54
- out = self.conv2(out)
55
- out += residual
56
- return out
57
-
58
- # Example usage
59
- # in_dim = 3
60
- # out_dim = 64
61
- # adapter = SimpleAdapterWithReshape(in_dim, out_dim)
62
- # x = torch.randn(1, in_dim, 4, 64, 64) # e.g., batch size = 1, channels = 3, frames/features = 4
63
- # output = adapter(x)
64
- # print(output.shape) # Should reflect transformed dimensions
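Note: the commented example above omits the kernel and stride arguments. A hedged usage sketch, assuming kernel_size=stride=2 (an assumption, not recorded in the file) and assuming the SimpleAdapter class defined above is importable: PixelUnshuffle(8) maps [H, W] to [H/8, W/8] with 64x the channels, and the stride-2 convolution halves the spatial size again, so a [B, C, F, H, W] input comes out as [B, out_dim, F, H/16, W/16].

import torch

adapter = SimpleAdapter(in_dim=3, out_dim=64, kernel_size=2, stride=2)  # kernel/stride assumed
x = torch.randn(1, 3, 4, 64, 64)    # batch=1, RGB, 4 frames, 64x64
print(adapter(x).shape)             # expected: torch.Size([1, 64, 4, 4, 4])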
videox_fun/models/wan_image_encoder.py DELETED
@@ -1,553 +0,0 @@
1
- # Modified from ``https://github.com/openai/CLIP'' and ``https://github.com/mlfoundations/open_clip''
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import math
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- import torchvision.transforms as T
9
-
10
- from .attention_utils import attention, flash_attention
11
- from .wan_xlm_roberta import XLMRoberta
12
- from diffusers.configuration_utils import ConfigMixin
13
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
14
- from diffusers.models.modeling_utils import ModelMixin
15
-
16
-
17
- __all__ = [
18
- 'XLMRobertaCLIP',
19
- 'clip_xlm_roberta_vit_h_14',
20
- 'CLIPModel',
21
- ]
22
-
23
-
24
- def pos_interpolate(pos, seq_len):
25
- if pos.size(1) == seq_len:
26
- return pos
27
- else:
28
- src_grid = int(math.sqrt(pos.size(1)))
29
- tar_grid = int(math.sqrt(seq_len))
30
- n = pos.size(1) - src_grid * src_grid
31
- return torch.cat([
32
- pos[:, :n],
33
- F.interpolate(
34
- pos[:, n:].float().reshape(1, src_grid, src_grid, -1).permute(
35
- 0, 3, 1, 2),
36
- size=(tar_grid, tar_grid),
37
- mode='bicubic',
38
- align_corners=False).flatten(2).transpose(1, 2)
39
- ],
40
- dim=1)
41
-
42
-
43
- class QuickGELU(nn.Module):
44
-
45
- def forward(self, x):
46
- return x * torch.sigmoid(1.702 * x)
47
-
48
-
49
- class LayerNorm(nn.LayerNorm):
50
-
51
- def forward(self, x):
52
- return super().forward(x.float()).type_as(x)
53
-
54
-
55
- class SelfAttention(nn.Module):
56
-
57
- def __init__(self,
58
- dim,
59
- num_heads,
60
- causal=False,
61
- attn_dropout=0.0,
62
- proj_dropout=0.0):
63
- assert dim % num_heads == 0
64
- super().__init__()
65
- self.dim = dim
66
- self.num_heads = num_heads
67
- self.head_dim = dim // num_heads
68
- self.causal = causal
69
- self.attn_dropout = attn_dropout
70
- self.proj_dropout = proj_dropout
71
-
72
- # layers
73
- self.to_qkv = nn.Linear(dim, dim * 3)
74
- self.proj = nn.Linear(dim, dim)
75
-
76
- def forward(self, x):
77
- """
78
- x: [B, L, C].
79
- """
80
- b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
81
-
82
- # compute query, key, value
83
- q, k, v = self.to_qkv(x).view(b, s, 3, n, d).unbind(2)
84
-
85
- # compute attention
86
- p = self.attn_dropout if self.training else 0.0
87
- x = attention(q, k, v, dropout_p=p, causal=self.causal, attention_type="none")
88
- x = x.reshape(b, s, c)
89
-
90
- # output
91
- x = self.proj(x)
92
- x = F.dropout(x, self.proj_dropout, self.training)
93
- return x
94
-
95
-
96
- class SwiGLU(nn.Module):
97
-
98
- def __init__(self, dim, mid_dim):
99
- super().__init__()
100
- self.dim = dim
101
- self.mid_dim = mid_dim
102
-
103
- # layers
104
- self.fc1 = nn.Linear(dim, mid_dim)
105
- self.fc2 = nn.Linear(dim, mid_dim)
106
- self.fc3 = nn.Linear(mid_dim, dim)
107
-
108
- def forward(self, x):
109
- x = F.silu(self.fc1(x)) * self.fc2(x)
110
- x = self.fc3(x)
111
- return x
112
-
113
-
114
- class AttentionBlock(nn.Module):
115
-
116
- def __init__(self,
117
- dim,
118
- mlp_ratio,
119
- num_heads,
120
- post_norm=False,
121
- causal=False,
122
- activation='quick_gelu',
123
- attn_dropout=0.0,
124
- proj_dropout=0.0,
125
- norm_eps=1e-5):
126
- assert activation in ['quick_gelu', 'gelu', 'swi_glu']
127
- super().__init__()
128
- self.dim = dim
129
- self.mlp_ratio = mlp_ratio
130
- self.num_heads = num_heads
131
- self.post_norm = post_norm
132
- self.causal = causal
133
- self.norm_eps = norm_eps
134
-
135
- # layers
136
- self.norm1 = LayerNorm(dim, eps=norm_eps)
137
- self.attn = SelfAttention(dim, num_heads, causal, attn_dropout,
138
- proj_dropout)
139
- self.norm2 = LayerNorm(dim, eps=norm_eps)
140
- if activation == 'swi_glu':
141
- self.mlp = SwiGLU(dim, int(dim * mlp_ratio))
142
- else:
143
- self.mlp = nn.Sequential(
144
- nn.Linear(dim, int(dim * mlp_ratio)),
145
- QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
146
- nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
147
-
148
- def forward(self, x):
149
- if self.post_norm:
150
- x = x + self.norm1(self.attn(x))
151
- x = x + self.norm2(self.mlp(x))
152
- else:
153
- x = x + self.attn(self.norm1(x))
154
- x = x + self.mlp(self.norm2(x))
155
- return x
156
-
157
-
158
- class AttentionPool(nn.Module):
159
-
160
- def __init__(self,
161
- dim,
162
- mlp_ratio,
163
- num_heads,
164
- activation='gelu',
165
- proj_dropout=0.0,
166
- norm_eps=1e-5):
167
- assert dim % num_heads == 0
168
- super().__init__()
169
- self.dim = dim
170
- self.mlp_ratio = mlp_ratio
171
- self.num_heads = num_heads
172
- self.head_dim = dim // num_heads
173
- self.proj_dropout = proj_dropout
174
- self.norm_eps = norm_eps
175
-
176
- # layers
177
- gain = 1.0 / math.sqrt(dim)
178
- self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
179
- self.to_q = nn.Linear(dim, dim)
180
- self.to_kv = nn.Linear(dim, dim * 2)
181
- self.proj = nn.Linear(dim, dim)
182
- self.norm = LayerNorm(dim, eps=norm_eps)
183
- self.mlp = nn.Sequential(
184
- nn.Linear(dim, int(dim * mlp_ratio)),
185
- QuickGELU() if activation == 'quick_gelu' else nn.GELU(),
186
- nn.Linear(int(dim * mlp_ratio), dim), nn.Dropout(proj_dropout))
187
-
188
- def forward(self, x):
189
- """
190
- x: [B, L, C].
191
- """
192
- b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
193
-
194
- # compute query, key, value
195
- q = self.to_q(self.cls_embedding).view(1, 1, n, d).expand(b, -1, -1, -1)
196
- k, v = self.to_kv(x).view(b, s, 2, n, d).unbind(2)
197
-
198
- # compute attention
199
- x = flash_attention(q, k, v, version=2)
200
- x = x.reshape(b, 1, c)
201
-
202
- # output
203
- x = self.proj(x)
204
- x = F.dropout(x, self.proj_dropout, self.training)
205
-
206
- # mlp
207
- x = x + self.mlp(self.norm(x))
208
- return x[:, 0]
209
-
210
-
211
- class VisionTransformer(nn.Module):
212
-
213
- def __init__(self,
214
- image_size=224,
215
- patch_size=16,
216
- dim=768,
217
- mlp_ratio=4,
218
- out_dim=512,
219
- num_heads=12,
220
- num_layers=12,
221
- pool_type='token',
222
- pre_norm=True,
223
- post_norm=False,
224
- activation='quick_gelu',
225
- attn_dropout=0.0,
226
- proj_dropout=0.0,
227
- embedding_dropout=0.0,
228
- norm_eps=1e-5):
229
- if image_size % patch_size != 0:
230
- print(
231
- '[WARNING] image_size is not divisible by patch_size',
232
- flush=True)
233
- assert pool_type in ('token', 'token_fc', 'attn_pool')
234
- out_dim = out_dim or dim
235
- super().__init__()
236
- self.image_size = image_size
237
- self.patch_size = patch_size
238
- self.num_patches = (image_size // patch_size)**2
239
- self.dim = dim
240
- self.mlp_ratio = mlp_ratio
241
- self.out_dim = out_dim
242
- self.num_heads = num_heads
243
- self.num_layers = num_layers
244
- self.pool_type = pool_type
245
- self.post_norm = post_norm
246
- self.norm_eps = norm_eps
247
-
248
- # embeddings
249
- gain = 1.0 / math.sqrt(dim)
250
- self.patch_embedding = nn.Conv2d(
251
- 3,
252
- dim,
253
- kernel_size=patch_size,
254
- stride=patch_size,
255
- bias=not pre_norm)
256
- if pool_type in ('token', 'token_fc'):
257
- self.cls_embedding = nn.Parameter(gain * torch.randn(1, 1, dim))
258
- self.pos_embedding = nn.Parameter(gain * torch.randn(
259
- 1, self.num_patches +
260
- (1 if pool_type in ('token', 'token_fc') else 0), dim))
261
- self.dropout = nn.Dropout(embedding_dropout)
262
-
263
- # transformer
264
- self.pre_norm = LayerNorm(dim, eps=norm_eps) if pre_norm else None
265
- self.transformer = nn.Sequential(*[
266
- AttentionBlock(dim, mlp_ratio, num_heads, post_norm, False,
267
- activation, attn_dropout, proj_dropout, norm_eps)
268
- for _ in range(num_layers)
269
- ])
270
- self.post_norm = LayerNorm(dim, eps=norm_eps)
271
-
272
- # head
273
- if pool_type == 'token':
274
- self.head = nn.Parameter(gain * torch.randn(dim, out_dim))
275
- elif pool_type == 'token_fc':
276
- self.head = nn.Linear(dim, out_dim)
277
- elif pool_type == 'attn_pool':
278
- self.head = AttentionPool(dim, mlp_ratio, num_heads, activation,
279
- proj_dropout, norm_eps)
280
-
281
- def forward(self, x, interpolation=False, use_31_block=False):
282
- b = x.size(0)
283
-
284
- # embeddings
285
- x = self.patch_embedding(x).flatten(2).permute(0, 2, 1)
286
- if self.pool_type in ('token', 'token_fc'):
287
- x = torch.cat([self.cls_embedding.expand(b, -1, -1), x], dim=1)
288
- if interpolation:
289
- e = pos_interpolate(self.pos_embedding, x.size(1))
290
- else:
291
- e = self.pos_embedding
292
- x = self.dropout(x + e)
293
- if self.pre_norm is not None:
294
- x = self.pre_norm(x)
295
-
296
- # transformer
297
- if use_31_block:
298
- x = self.transformer[:-1](x)
299
- return x
300
- else:
301
- x = self.transformer(x)
302
- return x
303
-
304
-
305
- class XLMRobertaWithHead(XLMRoberta):
306
-
307
- def __init__(self, **kwargs):
308
- self.out_dim = kwargs.pop('out_dim')
309
- super().__init__(**kwargs)
310
-
311
- # head
312
- mid_dim = (self.dim + self.out_dim) // 2
313
- self.head = nn.Sequential(
314
- nn.Linear(self.dim, mid_dim, bias=False), nn.GELU(),
315
- nn.Linear(mid_dim, self.out_dim, bias=False))
316
-
317
- def forward(self, ids):
318
- # xlm-roberta
319
- x = super().forward(ids)
320
-
321
- # average pooling
322
- mask = ids.ne(self.pad_id).unsqueeze(-1).to(x)
323
- x = (x * mask).sum(dim=1) / mask.sum(dim=1)
324
-
325
- # head
326
- x = self.head(x)
327
- return x
328
-
329
-
330
- class XLMRobertaCLIP(nn.Module):
331
-
332
- def __init__(self,
333
- embed_dim=1024,
334
- image_size=224,
335
- patch_size=14,
336
- vision_dim=1280,
337
- vision_mlp_ratio=4,
338
- vision_heads=16,
339
- vision_layers=32,
340
- vision_pool='token',
341
- vision_pre_norm=True,
342
- vision_post_norm=False,
343
- activation='gelu',
344
- vocab_size=250002,
345
- max_text_len=514,
346
- type_size=1,
347
- pad_id=1,
348
- text_dim=1024,
349
- text_heads=16,
350
- text_layers=24,
351
- text_post_norm=True,
352
- text_dropout=0.1,
353
- attn_dropout=0.0,
354
- proj_dropout=0.0,
355
- embedding_dropout=0.0,
356
- norm_eps=1e-5):
357
- super().__init__()
358
- self.embed_dim = embed_dim
359
- self.image_size = image_size
360
- self.patch_size = patch_size
361
- self.vision_dim = vision_dim
362
- self.vision_mlp_ratio = vision_mlp_ratio
363
- self.vision_heads = vision_heads
364
- self.vision_layers = vision_layers
365
- self.vision_pre_norm = vision_pre_norm
366
- self.vision_post_norm = vision_post_norm
367
- self.activation = activation
368
- self.vocab_size = vocab_size
369
- self.max_text_len = max_text_len
370
- self.type_size = type_size
371
- self.pad_id = pad_id
372
- self.text_dim = text_dim
373
- self.text_heads = text_heads
374
- self.text_layers = text_layers
375
- self.text_post_norm = text_post_norm
376
- self.norm_eps = norm_eps
377
-
378
- # models
379
- self.visual = VisionTransformer(
380
- image_size=image_size,
381
- patch_size=patch_size,
382
- dim=vision_dim,
383
- mlp_ratio=vision_mlp_ratio,
384
- out_dim=embed_dim,
385
- num_heads=vision_heads,
386
- num_layers=vision_layers,
387
- pool_type=vision_pool,
388
- pre_norm=vision_pre_norm,
389
- post_norm=vision_post_norm,
390
- activation=activation,
391
- attn_dropout=attn_dropout,
392
- proj_dropout=proj_dropout,
393
- embedding_dropout=embedding_dropout,
394
- norm_eps=norm_eps)
395
- self.textual = XLMRobertaWithHead(
396
- vocab_size=vocab_size,
397
- max_seq_len=max_text_len,
398
- type_size=type_size,
399
- pad_id=pad_id,
400
- dim=text_dim,
401
- out_dim=embed_dim,
402
- num_heads=text_heads,
403
- num_layers=text_layers,
404
- post_norm=text_post_norm,
405
- dropout=text_dropout)
406
- self.log_scale = nn.Parameter(math.log(1 / 0.07) * torch.ones([]))
407
-
408
- def forward(self, imgs, txt_ids):
409
- """
410
- imgs: [B, 3, H, W] of torch.float32.
411
- - mean: [0.48145466, 0.4578275, 0.40821073]
412
- - std: [0.26862954, 0.26130258, 0.27577711]
413
- txt_ids: [B, L] of torch.long.
414
- Encoded by data.CLIPTokenizer.
415
- """
416
- xi = self.visual(imgs)
417
- xt = self.textual(txt_ids)
418
- return xi, xt
419
-
420
- def param_groups(self):
421
- groups = [{
422
- 'params': [
423
- p for n, p in self.named_parameters()
424
- if 'norm' in n or n.endswith('bias')
425
- ],
426
- 'weight_decay': 0.0
427
- }, {
428
- 'params': [
429
- p for n, p in self.named_parameters()
430
- if not ('norm' in n or n.endswith('bias'))
431
- ]
432
- }]
433
- return groups
434
-
435
-
436
- def _clip(pretrained=False,
437
- pretrained_name=None,
438
- model_cls=XLMRobertaCLIP,
439
- return_transforms=False,
440
- return_tokenizer=False,
441
- tokenizer_padding='eos',
442
- dtype=torch.float32,
443
- device='cpu',
444
- **kwargs):
445
- # init a model on device
446
- with torch.device(device):
447
- model = model_cls(**kwargs)
448
-
449
- # set device
450
- model = model.to(dtype=dtype, device=device)
451
- output = (model,)
452
-
453
- # init transforms
454
- if return_transforms:
455
- # mean and std
456
- if 'siglip' in pretrained_name.lower():
457
- mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
458
- else:
459
- mean = [0.48145466, 0.4578275, 0.40821073]
460
- std = [0.26862954, 0.26130258, 0.27577711]
461
-
462
- # transforms
463
- transforms = T.Compose([
464
- T.Resize((model.image_size, model.image_size),
465
- interpolation=T.InterpolationMode.BICUBIC),
466
- T.ToTensor(),
467
- T.Normalize(mean=mean, std=std)
468
- ])
469
- output += (transforms,)
470
- return output[0] if len(output) == 1 else output
471
-
472
-
473
- def clip_xlm_roberta_vit_h_14(
474
- pretrained=False,
475
- pretrained_name='open-clip-xlm-roberta-large-vit-huge-14',
476
- **kwargs):
477
- cfg = dict(
478
- embed_dim=1024,
479
- image_size=224,
480
- patch_size=14,
481
- vision_dim=1280,
482
- vision_mlp_ratio=4,
483
- vision_heads=16,
484
- vision_layers=32,
485
- vision_pool='token',
486
- activation='gelu',
487
- vocab_size=250002,
488
- max_text_len=514,
489
- type_size=1,
490
- pad_id=1,
491
- text_dim=1024,
492
- text_heads=16,
493
- text_layers=24,
494
- text_post_norm=True,
495
- text_dropout=0.1,
496
- attn_dropout=0.0,
497
- proj_dropout=0.0,
498
- embedding_dropout=0.0)
499
- cfg.update(**kwargs)
500
- return _clip(pretrained, pretrained_name, XLMRobertaCLIP, **cfg)
501
-
502
-
503
- class CLIPModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
504
-
505
- def __init__(self):
506
- super(CLIPModel, self).__init__()
507
- # init model
508
- self.model, self.transforms = clip_xlm_roberta_vit_h_14(
509
- pretrained=False,
510
- return_transforms=True,
511
- return_tokenizer=False)
512
-
513
- def forward(self, videos):
514
- # preprocess
515
- size = (self.model.image_size,) * 2
516
- videos = torch.cat([
517
- F.interpolate(
518
- u.transpose(0, 1),
519
- size=size,
520
- mode='bicubic',
521
- align_corners=False) for u in videos
522
- ])
523
- videos = self.transforms.transforms[-1](videos.mul_(0.5).add_(0.5))
524
-
525
- # forward
526
- with torch.cuda.amp.autocast(dtype=self.dtype):
527
- out = self.model.visual(videos, use_31_block=True)
528
- return out
529
-
530
- @classmethod
531
- def from_pretrained(cls, pretrained_model_path, transformer_additional_kwargs={}):
532
- def filter_kwargs(cls, kwargs):
533
- import inspect
534
- sig = inspect.signature(cls.__init__)
535
- valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
536
- filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
537
- return filtered_kwargs
538
-
539
- model = cls(**filter_kwargs(cls, transformer_additional_kwargs))
540
- if pretrained_model_path.endswith(".safetensors"):
541
- from safetensors.torch import load_file, safe_open
542
- state_dict = load_file(pretrained_model_path)
543
- else:
544
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
545
- tmp_state_dict = {}
546
- for key in state_dict:
547
- tmp_state_dict["model." + key] = state_dict[key]
548
- state_dict = tmp_state_dict
549
- m, u = model.load_state_dict(state_dict, strict=False)
550
-
551
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
552
- print(m, u)
553
- return model
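
The CLIPModel.forward above resizes every frame to the CLIP input resolution, shifts pixel values from [-1, 1] into [0, 1], and then applies only the Normalize transform (the last entry of self.transforms). A minimal sketch of that preprocessing on a dummy clip, assuming inputs already lie in [-1, 1]; the 224x224 size and the mean/std follow the non-SigLIP branch of _clip:

import torch
import torch.nn.functional as F
import torchvision.transforms as T

mean = [0.48145466, 0.4578275, 0.40821073]   # CLIP statistics (non-SigLIP branch)
std = [0.26862954, 0.26130258, 0.27577711]
normalize = T.Normalize(mean=mean, std=std)

video = torch.rand(1, 3, 8, 64, 64) * 2 - 1  # [B, C, F, H, W], values in [-1, 1]

# [B, C, F, H, W] -> [B*F, C, 224, 224], matching the loop in CLIPModel.forward
frames = torch.cat([
    F.interpolate(u.transpose(0, 1), size=(224, 224),
                  mode='bicubic', align_corners=False)
    for u in video
])
frames = normalize(frames.mul(0.5).add(0.5))  # shift to [0, 1], then standardize
print(frames.shape)  # torch.Size([8, 3, 224, 224])
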
videox_fun/models/wan_text_encoder.py DELETED
@@ -1,395 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/t5.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import math
4
- from typing import Optional
5
-
6
- import torch
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from diffusers.configuration_utils import ConfigMixin
10
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
11
- from diffusers.models.modeling_utils import ModelMixin
12
-
13
-
14
- def fp16_clamp(x):
15
- if x.dtype == torch.float16 and torch.isinf(x).any():
16
- clamp = torch.finfo(x.dtype).max - 1000
17
- x = torch.clamp(x, min=-clamp, max=clamp)
18
- return x
19
-
20
-
21
- def init_weights(m):
22
- if isinstance(m, T5LayerNorm):
23
- nn.init.ones_(m.weight)
24
- elif isinstance(m, T5FeedForward):
25
- nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
26
- nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
27
- nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
28
- elif isinstance(m, T5Attention):
29
- nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
30
- nn.init.normal_(m.k.weight, std=m.dim**-0.5)
31
- nn.init.normal_(m.v.weight, std=m.dim**-0.5)
32
- nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
33
- elif isinstance(m, T5RelativeEmbedding):
34
- nn.init.normal_(
35
- m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)
36
-
37
-
38
- class GELU(nn.Module):
39
- def forward(self, x):
40
- return 0.5 * x * (1.0 + torch.tanh(
41
- math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
42
-
43
-
44
- class T5LayerNorm(nn.Module):
45
- def __init__(self, dim, eps=1e-6):
46
- super(T5LayerNorm, self).__init__()
47
- self.dim = dim
48
- self.eps = eps
49
- self.weight = nn.Parameter(torch.ones(dim))
50
-
51
- def forward(self, x):
52
- x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
53
- self.eps)
54
- if self.weight.dtype in [torch.float16, torch.bfloat16]:
55
- x = x.type_as(self.weight)
56
- return self.weight * x
57
-
58
-
59
- class T5Attention(nn.Module):
60
- def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
61
- assert dim_attn % num_heads == 0
62
- super(T5Attention, self).__init__()
63
- self.dim = dim
64
- self.dim_attn = dim_attn
65
- self.num_heads = num_heads
66
- self.head_dim = dim_attn // num_heads
67
-
68
- # layers
69
- self.q = nn.Linear(dim, dim_attn, bias=False)
70
- self.k = nn.Linear(dim, dim_attn, bias=False)
71
- self.v = nn.Linear(dim, dim_attn, bias=False)
72
- self.o = nn.Linear(dim_attn, dim, bias=False)
73
- self.dropout = nn.Dropout(dropout)
74
-
75
- def forward(self, x, context=None, mask=None, pos_bias=None):
76
- """
77
- x: [B, L1, C].
78
- context: [B, L2, C] or None.
79
- mask: [B, L2] or [B, L1, L2] or None.
80
- """
81
- # check inputs
82
- context = x if context is None else context
83
- b, n, c = x.size(0), self.num_heads, self.head_dim
84
-
85
- # compute query, key, value
86
- q = self.q(x).view(b, -1, n, c)
87
- k = self.k(context).view(b, -1, n, c)
88
- v = self.v(context).view(b, -1, n, c)
89
-
90
- # attention bias
91
- attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
92
- if pos_bias is not None:
93
- attn_bias += pos_bias
94
- if mask is not None:
95
- assert mask.ndim in [2, 3]
96
- mask = mask.view(b, 1, 1,
97
- -1) if mask.ndim == 2 else mask.unsqueeze(1)
98
- attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
99
-
100
- # compute attention (T5 does not use scaling)
101
- attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
102
- attn = F.softmax(attn.float(), dim=-1).type_as(attn)
103
- x = torch.einsum('bnij,bjnc->binc', attn, v)
104
-
105
- # output
106
- x = x.reshape(b, -1, n * c)
107
- x = self.o(x)
108
- x = self.dropout(x)
109
- return x
110
-
111
-
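
Note that T5Attention above adds the relative-position bias to unscaled dot products (there is no 1/sqrt(d) factor, unlike standard scaled dot-product attention). A minimal sketch of that score computation with hypothetical shapes:

import torch

B, L, n, c = 2, 16, 4, 32                   # batch, length, heads, head_dim
q = torch.randn(B, L, n, c)
k = torch.randn(B, L, n, c)
v = torch.randn(B, L, n, c)
pos_bias = torch.randn(1, n, L, L)          # e.g. output of T5RelativeEmbedding

attn = torch.einsum('binc,bjnc->bnij', q, k) + pos_bias   # no scaling, bias added
attn = attn.softmax(dim=-1)
out = torch.einsum('bnij,bjnc->binc', attn, v).reshape(B, L, n * c)
print(out.shape)  # torch.Size([2, 16, 128])
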
112
- class T5FeedForward(nn.Module):
113
-
114
- def __init__(self, dim, dim_ffn, dropout=0.1):
115
- super(T5FeedForward, self).__init__()
116
- self.dim = dim
117
- self.dim_ffn = dim_ffn
118
-
119
- # layers
120
- self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
121
- self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
122
- self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
123
- self.dropout = nn.Dropout(dropout)
124
-
125
- def forward(self, x):
126
- x = self.fc1(x) * self.gate(x)
127
- x = self.dropout(x)
128
- x = self.fc2(x)
129
- x = self.dropout(x)
130
- return x
131
-
132
-
133
- class T5SelfAttention(nn.Module):
134
- def __init__(self,
135
- dim,
136
- dim_attn,
137
- dim_ffn,
138
- num_heads,
139
- num_buckets,
140
- shared_pos=True,
141
- dropout=0.1):
142
- super(T5SelfAttention, self).__init__()
143
- self.dim = dim
144
- self.dim_attn = dim_attn
145
- self.dim_ffn = dim_ffn
146
- self.num_heads = num_heads
147
- self.num_buckets = num_buckets
148
- self.shared_pos = shared_pos
149
-
150
- # layers
151
- self.norm1 = T5LayerNorm(dim)
152
- self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
153
- self.norm2 = T5LayerNorm(dim)
154
- self.ffn = T5FeedForward(dim, dim_ffn, dropout)
155
- self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
156
- num_buckets, num_heads, bidirectional=True)
157
-
158
- def forward(self, x, mask=None, pos_bias=None):
159
- e = pos_bias if self.shared_pos else self.pos_embedding(
160
- x.size(1), x.size(1))
161
- x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
162
- x = fp16_clamp(x + self.ffn(self.norm2(x)))
163
- return x
164
-
165
-
166
- class T5CrossAttention(nn.Module):
167
- def __init__(self,
168
- dim,
169
- dim_attn,
170
- dim_ffn,
171
- num_heads,
172
- num_buckets,
173
- shared_pos=True,
174
- dropout=0.1):
175
- super(T5CrossAttention, self).__init__()
176
- self.dim = dim
177
- self.dim_attn = dim_attn
178
- self.dim_ffn = dim_ffn
179
- self.num_heads = num_heads
180
- self.num_buckets = num_buckets
181
- self.shared_pos = shared_pos
182
-
183
- # layers
184
- self.norm1 = T5LayerNorm(dim)
185
- self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
186
- self.norm2 = T5LayerNorm(dim)
187
- self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
188
- self.norm3 = T5LayerNorm(dim)
189
- self.ffn = T5FeedForward(dim, dim_ffn, dropout)
190
- self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
191
- num_buckets, num_heads, bidirectional=False)
192
-
193
- def forward(self,
194
- x,
195
- mask=None,
196
- encoder_states=None,
197
- encoder_mask=None,
198
- pos_bias=None):
199
- e = pos_bias if self.shared_pos else self.pos_embedding(
200
- x.size(1), x.size(1))
201
- x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
202
- x = fp16_clamp(x + self.cross_attn(
203
- self.norm2(x), context=encoder_states, mask=encoder_mask))
204
- x = fp16_clamp(x + self.ffn(self.norm3(x)))
205
- return x
206
-
207
-
208
- class T5RelativeEmbedding(nn.Module):
209
- def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
210
- super(T5RelativeEmbedding, self).__init__()
211
- self.num_buckets = num_buckets
212
- self.num_heads = num_heads
213
- self.bidirectional = bidirectional
214
- self.max_dist = max_dist
215
-
216
- # layers
217
- self.embedding = nn.Embedding(num_buckets, num_heads)
218
-
219
- def forward(self, lq, lk):
220
- device = self.embedding.weight.device
221
- # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
222
- # torch.arange(lq).unsqueeze(1).to(device)
223
- if torch.device(type="meta") != device:
224
- rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
225
- torch.arange(lq, device=device).unsqueeze(1)
226
- else:
227
- rel_pos = torch.arange(lk).unsqueeze(0) - \
228
- torch.arange(lq).unsqueeze(1)
229
- rel_pos = self._relative_position_bucket(rel_pos)
230
- rel_pos_embeds = self.embedding(rel_pos)
231
- rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
232
- 0) # [1, N, Lq, Lk]
233
- return rel_pos_embeds.contiguous()
234
-
235
- def _relative_position_bucket(self, rel_pos):
236
- # preprocess
237
- if self.bidirectional:
238
- num_buckets = self.num_buckets // 2
239
- rel_buckets = (rel_pos > 0).long() * num_buckets
240
- rel_pos = torch.abs(rel_pos)
241
- else:
242
- num_buckets = self.num_buckets
243
- rel_buckets = 0
244
- rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
245
-
246
- # embeddings for small and large positions
247
- max_exact = num_buckets // 2
248
- rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
249
- math.log(self.max_dist / max_exact) *
250
- (num_buckets - max_exact)).long()
251
- rel_pos_large = torch.min(
252
- rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
253
- rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
254
- return rel_buckets
255
-
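
The bucketing above follows the T5 scheme: in the bidirectional case half of the buckets encode the sign of the offset, offsets smaller than max_exact each get their own bucket, and larger offsets are binned logarithmically up to max_dist. A standalone sketch of the same formula; num_buckets=32 and max_dist=128 are illustrative values, not necessarily the ones used by a given checkpoint:

import math
import torch

def relative_position_bucket(rel_pos, num_buckets=32, max_dist=128):
    # bidirectional: the upper half of the buckets means "key is to the right"
    num_buckets //= 2
    buckets = (rel_pos > 0).long() * num_buckets
    rel_pos = torch.abs(rel_pos)

    # exact buckets for small offsets, log-spaced buckets for large ones
    max_exact = num_buckets // 2
    # clamp only guards log(0); those entries are never selected by torch.where below
    large = max_exact + (torch.log(rel_pos.float().clamp(min=1) / max_exact) /
                         math.log(max_dist / max_exact) *
                         (num_buckets - max_exact)).long()
    large = torch.min(large, torch.full_like(large, num_buckets - 1))
    return buckets + torch.where(rel_pos < max_exact, rel_pos, large)

offsets = torch.arange(-10, 11)
print(relative_position_bucket(offsets))  # nearby offsets keep distinct buckets
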
256
- class WanT5EncoderModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
257
- def __init__(self,
258
- vocab,
259
- dim,
260
- dim_attn,
261
- dim_ffn,
262
- num_heads,
263
- num_layers,
264
- num_buckets,
265
- shared_pos=True,
266
- dropout=0.1):
267
- super(WanT5EncoderModel, self).__init__()
268
- self.dim = dim
269
- self.dim_attn = dim_attn
270
- self.dim_ffn = dim_ffn
271
- self.num_heads = num_heads
272
- self.num_layers = num_layers
273
- self.num_buckets = num_buckets
274
- self.shared_pos = shared_pos
275
-
276
- # layers
277
- self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
278
- else nn.Embedding(vocab, dim)
279
- self.pos_embedding = T5RelativeEmbedding(
280
- num_buckets, num_heads, bidirectional=True) if shared_pos else None
281
- self.dropout = nn.Dropout(dropout)
282
- self.blocks = nn.ModuleList([
283
- T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
284
- shared_pos, dropout) for _ in range(num_layers)
285
- ])
286
- self.norm = T5LayerNorm(dim)
287
-
288
- # initialize weights
289
- self.apply(init_weights)
290
-
291
- def forward(
292
- self,
293
- input_ids: Optional[torch.LongTensor] = None,
294
- attention_mask: Optional[torch.FloatTensor] = None,
295
- ):
296
- x = self.token_embedding(input_ids)
297
- x = self.dropout(x)
298
- e = self.pos_embedding(x.size(1),
299
- x.size(1)) if self.shared_pos else None
300
- for block in self.blocks:
301
- x = block(x, attention_mask, pos_bias=e)
302
- x = self.norm(x)
303
- x = self.dropout(x)
304
- return (x, )
305
-
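
A minimal shape check for the encoder above, assuming the module is still importable from its original path (this commit removes it) and using a deliberately tiny configuration rather than the real umT5 sizes:

import torch
from videox_fun.models.wan_text_encoder import WanT5EncoderModel

encoder = WanT5EncoderModel(
    vocab=1000, dim=64, dim_attn=64, dim_ffn=128,
    num_heads=4, num_layers=2, num_buckets=32,
)
ids = torch.randint(0, 1000, (2, 16))        # [B, L] token ids
mask = torch.ones(2, 16, dtype=torch.long)   # [B, L] attention mask
(hidden,) = encoder(input_ids=ids, attention_mask=mask)
print(hidden.shape)  # torch.Size([2, 16, 64])
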
306
- @classmethod
307
- def from_pretrained(cls, pretrained_model_path, additional_kwargs={}, low_cpu_mem_usage=False, torch_dtype=torch.bfloat16):
308
- def filter_kwargs(cls, kwargs):
309
- import inspect
310
- sig = inspect.signature(cls.__init__)
311
- valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
312
- filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
313
- return filtered_kwargs
314
-
315
- if low_cpu_mem_usage:
316
- try:
317
- import re
318
-
319
- from diffusers import __version__ as diffusers_version
320
- if diffusers_version >= "0.33.0":
321
- from diffusers.models.model_loading_utils import \
322
- load_model_dict_into_meta
323
- else:
324
- from diffusers.models.modeling_utils import \
325
- load_model_dict_into_meta
326
- from diffusers.utils import is_accelerate_available
327
- if is_accelerate_available():
328
- import accelerate
329
-
330
- # Instantiate model with empty weights
331
- with accelerate.init_empty_weights():
332
- model = cls(**filter_kwargs(cls, additional_kwargs))
333
-
334
- param_device = "cpu"
335
- if pretrained_model_path.endswith(".safetensors"):
336
- from safetensors.torch import load_file
337
- state_dict = load_file(pretrained_model_path)
338
- else:
339
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
340
-
341
- if diffusers_version >= "0.33.0":
342
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
343
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
344
- load_model_dict_into_meta(
345
- model,
346
- state_dict,
347
- dtype=torch_dtype,
348
- model_name_or_path=pretrained_model_path,
349
- )
350
- else:
351
- # move the params from meta device to cpu
352
- missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
353
- if len(missing_keys) > 0:
354
- raise ValueError(
355
- f"Cannot load {cls} from {pretrained_model_path} because the following keys are"
356
- f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
357
- " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
358
- " those weights or else make sure your checkpoint file is correct."
359
- )
360
-
361
- unexpected_keys = load_model_dict_into_meta(
362
- model,
363
- state_dict,
364
- device=param_device,
365
- dtype=torch_dtype,
366
- model_name_or_path=pretrained_model_path,
367
- )
368
-
369
- if cls._keys_to_ignore_on_load_unexpected is not None:
370
- for pat in cls._keys_to_ignore_on_load_unexpected:
371
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
372
-
373
- if len(unexpected_keys) > 0:
374
- print(
375
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
376
- )
377
-
378
- return model
379
- except Exception as e:
380
- print(
381
- f"The low_cpu_mem_usage mode is not work because {e}. Use low_cpu_mem_usage=False instead."
382
- )
383
-
384
- model = cls(**filter_kwargs(cls, additional_kwargs))
385
- if pretrained_model_path.endswith(".safetensors"):
386
- from safetensors.torch import load_file, safe_open
387
- state_dict = load_file(pretrained_model_path)
388
- else:
389
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
390
- m, u = model.load_state_dict(state_dict, strict=False)
391
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
392
- print(m, u)
393
-
394
- model = model.to(torch_dtype)
395
- return model
videox_fun/models/wan_transformer3d.py DELETED
@@ -1,1394 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
-
4
- import glob
5
- import json
6
- import math
7
- import os
8
- import types
9
- import warnings
10
- from typing import Any, Dict, Optional, Union
11
-
12
- import numpy as np
13
- import torch
14
- import torch.cuda.amp as amp
15
- import torch.nn as nn
16
- from diffusers.configuration_utils import ConfigMixin, register_to_config
17
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
18
- from diffusers.models.modeling_utils import ModelMixin
19
- from diffusers.utils import is_torch_version, logging
20
- from torch import nn
21
-
22
- from ..dist import (get_sequence_parallel_rank,
23
- get_sequence_parallel_world_size, get_sp_group,
24
- usp_attn_forward, xFuserLongContextAttention)
25
- from ..utils import cfg_skip
26
- from .attention_utils import attention
27
- from .cache_utils import TeaCache
28
- from .wan_camera_adapter import SimpleAdapter
29
-
30
-
31
- def sinusoidal_embedding_1d(dim, position):
32
- # preprocess
33
- assert dim % 2 == 0
34
- half = dim // 2
35
- position = position.type(torch.float64)
36
-
37
- # calculation
38
- sinusoid = torch.outer(
39
- position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
40
- x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
41
- return x
42
-
43
-
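
sinusoidal_embedding_1d maps each (possibly per-token) timestep to a dim-sized vector whose first half holds cosines and second half sines over log-spaced frequencies; this is what later feeds time_embedding. A shape check using the function defined above and the model's default freq_dim of 256:

import torch

t = torch.tensor([0, 250, 500, 999])            # diffusion timesteps, shape [B]
emb = sinusoidal_embedding_1d(256, t)
print(emb.shape, emb.dtype)                     # torch.Size([4, 256]) torch.float64

# per-token timesteps are flattened first, exactly as in the transformer forward
t2 = torch.randint(0, 1000, (2, 1024))
emb2 = sinusoidal_embedding_1d(256, t2.flatten()).unflatten(0, (2, 1024))
print(emb2.shape)                               # torch.Size([2, 1024, 256])
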
44
- @amp.autocast(enabled=False)
45
- def rope_params(max_seq_len, dim, theta=10000):
46
- assert dim % 2 == 0
47
- freqs = torch.outer(
48
- torch.arange(max_seq_len),
49
- 1.0 / torch.pow(theta,
50
- torch.arange(0, dim, 2).to(torch.float64).div(dim)))
51
- freqs = torch.polar(torch.ones_like(freqs), freqs)
52
- return freqs
53
-
54
-
55
- # modified from https://github.com/thu-ml/RIFLEx/blob/main/riflex_utils.py
56
- @amp.autocast(enabled=False)
57
- def get_1d_rotary_pos_embed_riflex(
58
- pos: Union[np.ndarray, int],
59
- dim: int,
60
- theta: float = 10000.0,
61
- use_real=False,
62
- k: Optional[int] = None,
63
- L_test: Optional[int] = None,
64
- L_test_scale: Optional[int] = None,
65
- ):
66
- """
67
- RIFLEx: Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
68
-
69
- This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the
70
- position indices 'pos'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
71
- data type.
72
-
73
- Args:
74
- dim (`int`): Dimension of the frequency tensor.
75
- pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
76
- theta (`float`, *optional*, defaults to 10000.0):
77
- Scaling factor for frequency computation. Defaults to 10000.0.
78
- use_real (`bool`, *optional*):
79
- If True, return real part and imaginary part separately. Otherwise, return complex numbers.
80
- k (`int`, *optional*, defaults to None): the index for the intrinsic frequency in RoPE
81
- L_test (`int`, *optional*, defaults to None): the number of frames for inference
82
- Returns:
83
- `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
84
- """
85
- assert dim % 2 == 0
86
-
87
- if isinstance(pos, int):
88
- pos = torch.arange(pos)
89
- if isinstance(pos, np.ndarray):
90
- pos = torch.from_numpy(pos) # type: ignore # [S]
91
-
92
- freqs = 1.0 / torch.pow(theta,
93
- torch.arange(0, dim, 2).to(torch.float64).div(dim))
94
-
95
- # === Riflex modification start ===
96
- # Reduce the intrinsic frequency to stay within a single period after extrapolation (see Eq. (8)).
97
- # Empirical observations show that a few videos may exhibit repetition in the tail frames.
98
- # To be conservative, we multiply by 0.9 to keep the extrapolated length below 90% of a single period.
99
- if k is not None:
100
- freqs[k-1] = 0.9 * 2 * torch.pi / L_test
101
- # === Riflex modification end ===
102
- if L_test_scale is not None:
103
- freqs[k-1] = freqs[k-1] / L_test_scale
104
-
105
- freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
106
- if use_real:
107
- freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float() # [S, D]
108
- freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float() # [S, D]
109
- return freqs_cos, freqs_sin
110
- else:
111
- # lumina
112
- freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
113
- return freqs_cis
114
-
115
-
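
The transformer later builds its 3D rotary table by concatenating three rope_params tables, one per axis, after splitting each head's channels as (d - 4*(d//6), 2*(d//6), 2*(d//6)). A quick sketch of that split for a hypothetical head dimension of 128 (dim // num_heads in the real configs), using rope_params from above:

import torch

d = 128                                     # hypothetical per-head channel count
freqs = torch.cat([
    rope_params(1024, d - 4 * (d // 6)),    # temporal axis: 44 channels -> 22 phases
    rope_params(1024, 2 * (d // 6)),        # height axis:   42 channels -> 21 phases
    rope_params(1024, 2 * (d // 6)),        # width axis:    42 channels -> 21 phases
], dim=1)
print(freqs.shape, freqs.dtype)             # torch.Size([1024, 64]) torch.complex128
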
116
- # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
117
- def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
118
- tw = tgt_width
119
- th = tgt_height
120
- h, w = src
121
- r = h / w
122
- if r > (th / tw):
123
- resize_height = th
124
- resize_width = int(round(th / h * w))
125
- else:
126
- resize_width = tw
127
- resize_height = int(round(tw / w * h))
128
-
129
- crop_top = int(round((th - resize_height) / 2.0))
130
- crop_left = int(round((tw - resize_width) / 2.0))
131
-
132
- return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
133
-
134
-
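
A quick numeric check of get_resize_crop_region_for_grid: a 480x832 source mapped onto a 1280x720 target fills the height, so the crop is centered horizontally.

# src is (h, w); the target is given as (tgt_width, tgt_height)
top_left, bottom_right = get_resize_crop_region_for_grid((480, 832), 1280, 720)
print(top_left, bottom_right)  # (0, 16) (720, 1264)
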
135
- @amp.autocast(enabled=False)
136
- @torch.compiler.disable()
137
- def rope_apply(x, grid_sizes, freqs):
138
- n, c = x.size(2), x.size(3) // 2
139
-
140
- # split freqs
141
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
142
-
143
- # loop over samples
144
- output = []
145
- for i, (f, h, w) in enumerate(grid_sizes.tolist()):
146
- seq_len = f * h * w
147
-
148
- # precompute multipliers
149
- x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float32).reshape(
150
- seq_len, n, -1, 2))
151
- freqs_i = torch.cat([
152
- freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
153
- freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
154
- freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
155
- ],
156
- dim=-1).reshape(seq_len, 1, -1)
157
-
158
- # apply rotary embedding
159
- x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
160
- x_i = torch.cat([x_i, x[i, seq_len:]])
161
-
162
- # append to collection
163
- output.append(x_i)
164
- return torch.stack(output).to(x.dtype)
165
-
166
-
167
- def rope_apply_qk(q, k, grid_sizes, freqs):
168
- q = rope_apply(q, grid_sizes, freqs)
169
- k = rope_apply(k, grid_sizes, freqs)
170
- return q, k
171
-
172
-
173
- class WanRMSNorm(nn.Module):
174
-
175
- def __init__(self, dim, eps=1e-5):
176
- super().__init__()
177
- self.dim = dim
178
- self.eps = eps
179
- self.weight = nn.Parameter(torch.ones(dim))
180
-
181
- def forward(self, x):
182
- r"""
183
- Args:
184
- x(Tensor): Shape [B, L, C]
185
- """
186
- return self._norm(x) * self.weight
187
-
188
- def _norm(self, x):
189
- return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps).to(x.dtype)
190
-
191
-
192
- class WanLayerNorm(nn.LayerNorm):
193
-
194
- def __init__(self, dim, eps=1e-6, elementwise_affine=False):
195
- super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
196
-
197
- def forward(self, x):
198
- r"""
199
- Args:
200
- x(Tensor): Shape [B, L, C]
201
- """
202
- return super().forward(x)
203
-
204
-
205
- class WanSelfAttention(nn.Module):
206
-
207
- def __init__(self,
208
- dim,
209
- num_heads,
210
- window_size=(-1, -1),
211
- qk_norm=True,
212
- eps=1e-6):
213
- assert dim % num_heads == 0
214
- super().__init__()
215
- self.dim = dim
216
- self.num_heads = num_heads
217
- self.head_dim = dim // num_heads
218
- self.window_size = window_size
219
- self.qk_norm = qk_norm
220
- self.eps = eps
221
-
222
- # layers
223
- self.q = nn.Linear(dim, dim)
224
- self.k = nn.Linear(dim, dim)
225
- self.v = nn.Linear(dim, dim)
226
- self.o = nn.Linear(dim, dim)
227
- self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
228
- self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
229
-
230
- def forward(self, x, seq_lens, grid_sizes, freqs, dtype=torch.bfloat16, t=0):
231
- r"""
232
- Args:
233
- x(Tensor): Shape [B, L, num_heads, C / num_heads]
234
- seq_lens(Tensor): Shape [B]
235
- grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
236
- freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
237
- """
238
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
239
-
240
- # query, key, value function
241
- def qkv_fn(x):
242
- q = self.norm_q(self.q(x.to(dtype))).view(b, s, n, d)
243
- k = self.norm_k(self.k(x.to(dtype))).view(b, s, n, d)
244
- v = self.v(x.to(dtype)).view(b, s, n, d)
245
- return q, k, v
246
-
247
- q, k, v = qkv_fn(x)
248
-
249
- q, k = rope_apply_qk(q, k, grid_sizes, freqs)
250
-
251
- x = attention(
252
- q.to(dtype),
253
- k.to(dtype),
254
- v=v.to(dtype),
255
- k_lens=seq_lens,
256
- window_size=self.window_size)
257
- x = x.to(dtype)
258
-
259
- # output
260
- x = x.flatten(2)
261
- x = self.o(x)
262
- return x
263
-
264
-
265
- class WanT2VCrossAttention(WanSelfAttention):
266
-
267
- def forward(self, x, context, context_lens, dtype=torch.bfloat16, t=0):
268
- r"""
269
- Args:
270
- x(Tensor): Shape [B, L1, C]
271
- context(Tensor): Shape [B, L2, C]
272
- context_lens(Tensor): Shape [B]
273
- """
274
- b, n, d = x.size(0), self.num_heads, self.head_dim
275
-
276
- # compute query, key, value
277
- q = self.norm_q(self.q(x.to(dtype))).view(b, -1, n, d)
278
- k = self.norm_k(self.k(context.to(dtype))).view(b, -1, n, d)
279
- v = self.v(context.to(dtype)).view(b, -1, n, d)
280
-
281
- # compute attention
282
- x = attention(
283
- q.to(dtype),
284
- k.to(dtype),
285
- v.to(dtype),
286
- k_lens=context_lens
287
- )
288
- x = x.to(dtype)
289
-
290
- # output
291
- x = x.flatten(2)
292
- x = self.o(x)
293
- return x
294
-
295
-
296
- class WanI2VCrossAttention(WanSelfAttention):
297
-
298
- def __init__(self,
299
- dim,
300
- num_heads,
301
- window_size=(-1, -1),
302
- qk_norm=True,
303
- eps=1e-6):
304
- super().__init__(dim, num_heads, window_size, qk_norm, eps)
305
-
306
- self.k_img = nn.Linear(dim, dim)
307
- self.v_img = nn.Linear(dim, dim)
308
- # self.alpha = nn.Parameter(torch.zeros((1, )))
309
- self.norm_k_img = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
310
-
311
- def forward(self, x, context, context_lens, dtype=torch.bfloat16, t=0):
312
- r"""
313
- Args:
314
- x(Tensor): Shape [B, L1, C]
315
- context(Tensor): Shape [B, L2, C]
316
- context_lens(Tensor): Shape [B]
317
- """
318
- context_img = context[:, :257]
319
- context = context[:, 257:]
320
- b, n, d = x.size(0), self.num_heads, self.head_dim
321
-
322
- # compute query, key, value
323
- q = self.norm_q(self.q(x.to(dtype))).view(b, -1, n, d)
324
- k = self.norm_k(self.k(context.to(dtype))).view(b, -1, n, d)
325
- v = self.v(context.to(dtype)).view(b, -1, n, d)
326
- k_img = self.norm_k_img(self.k_img(context_img.to(dtype))).view(b, -1, n, d)
327
- v_img = self.v_img(context_img.to(dtype)).view(b, -1, n, d)
328
-
329
- img_x = attention(
330
- q.to(dtype),
331
- k_img.to(dtype),
332
- v_img.to(dtype),
333
- k_lens=None
334
- )
335
- img_x = img_x.to(dtype)
336
- # compute attention
337
- x = attention(
338
- q.to(dtype),
339
- k.to(dtype),
340
- v.to(dtype),
341
- k_lens=context_lens
342
- )
343
- x = x.to(dtype)
344
-
345
- # output
346
- x = x.flatten(2)
347
- img_x = img_x.flatten(2)
348
- x = x + img_x
349
- x = self.o(x)
350
- return x
351
-
352
-
353
- class WanCrossAttention(WanSelfAttention):
354
- def forward(self, x, context, context_lens, dtype=torch.bfloat16, t=0):
355
- r"""
356
- Args:
357
- x(Tensor): Shape [B, L1, C]
358
- context(Tensor): Shape [B, L2, C]
359
- context_lens(Tensor): Shape [B]
360
- """
361
- b, n, d = x.size(0), self.num_heads, self.head_dim
362
- # compute query, key, value
363
- q = self.norm_q(self.q(x.to(dtype))).view(b, -1, n, d)
364
- k = self.norm_k(self.k(context.to(dtype))).view(b, -1, n, d)
365
- v = self.v(context.to(dtype)).view(b, -1, n, d)
366
- # compute attention
367
- x = attention(q.to(dtype), k.to(dtype), v.to(dtype), k_lens=context_lens)
368
- # output
369
- x = x.flatten(2)
370
- x = self.o(x.to(dtype))
371
- return x
372
-
373
-
374
- WAN_CROSSATTENTION_CLASSES = {
375
- 't2v_cross_attn': WanT2VCrossAttention,
376
- 'i2v_cross_attn': WanI2VCrossAttention,
377
- 'cross_attn': WanCrossAttention,
378
- }
379
-
380
-
381
- class WanAttentionBlock(nn.Module):
382
-
383
- def __init__(self,
384
- cross_attn_type,
385
- dim,
386
- ffn_dim,
387
- num_heads,
388
- window_size=(-1, -1),
389
- qk_norm=True,
390
- cross_attn_norm=False,
391
- eps=1e-6):
392
- super().__init__()
393
- self.dim = dim
394
- self.ffn_dim = ffn_dim
395
- self.num_heads = num_heads
396
- self.window_size = window_size
397
- self.qk_norm = qk_norm
398
- self.cross_attn_norm = cross_attn_norm
399
- self.eps = eps
400
-
401
- # layers
402
- self.norm1 = WanLayerNorm(dim, eps)
403
- self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
404
- eps)
405
- self.norm3 = WanLayerNorm(
406
- dim, eps,
407
- elementwise_affine=True) if cross_attn_norm else nn.Identity()
408
- self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim,
409
- num_heads,
410
- (-1, -1),
411
- qk_norm,
412
- eps)
413
- self.norm2 = WanLayerNorm(dim, eps)
414
- self.ffn = nn.Sequential(
415
- nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
416
- nn.Linear(ffn_dim, dim))
417
-
418
- # modulation
419
- self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
420
-
421
- def forward(
422
- self,
423
- x,
424
- e,
425
- seq_lens,
426
- grid_sizes,
427
- freqs,
428
- context,
429
- context_lens,
430
- dtype=torch.bfloat16,
431
- t=0,
432
- ):
433
- r"""
434
- Args:
435
- x(Tensor): Shape [B, L, C]
436
- e(Tensor): Shape [B, 6, C]
437
- seq_lens(Tensor): Shape [B], length of each sequence in batch
438
- grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
439
- freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
440
- """
441
- if e.dim() > 3:
442
- e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2)
443
- e = [e.squeeze(2) for e in e]
444
- else:
445
- e = (self.modulation + e).chunk(6, dim=1)
446
-
447
- # self-attention
448
- temp_x = self.norm1(x) * (1 + e[1]) + e[0]
449
- temp_x = temp_x.to(dtype)
450
-
451
- y = self.self_attn(temp_x, seq_lens, grid_sizes, freqs, dtype, t=t)
452
- x = x + y * e[2]
453
-
454
- # cross-attention & ffn function
455
- def cross_attn_ffn(x, context, context_lens, e):
456
- # cross-attention
457
- x = x + self.cross_attn(self.norm3(x), context, context_lens, dtype, t=t)
458
-
459
- # ffn function
460
- temp_x = self.norm2(x) * (1 + e[4]) + e[3]
461
- temp_x = temp_x.to(dtype)
462
-
463
- y = self.ffn(temp_x)
464
- x = x + y * e[5]
465
- return x
466
-
467
- x = cross_attn_ffn(x, context, context_lens, e)
468
- return x
469
-
470
-
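
Each WanAttentionBlock folds the timestep signal in through six modulation vectors: shift/scale/gate around self-attention (e[0..2]) and shift/scale/gate around the feed-forward (e[3..5]), added to a learned per-block modulation table. A minimal sketch of that adaptive-LayerNorm pattern with stand-in sublayers:

import torch
import torch.nn as nn

dim, B, L = 64, 2, 16
modulation = torch.randn(1, 6, dim) / dim**0.5    # per-block learned table
e = torch.randn(B, 6, dim)                        # from time_projection, [B, 6, C]
shift_sa, scale_sa, gate_sa, shift_ff, scale_ff, gate_ff = (modulation + e).chunk(6, dim=1)

norm = nn.LayerNorm(dim, elementwise_affine=False)
sublayer = nn.Identity()                          # stand-in for self_attn / ffn

x = torch.randn(B, L, dim)
x = x + sublayer(norm(x) * (1 + scale_sa) + shift_sa) * gate_sa   # attention branch
x = x + sublayer(norm(x) * (1 + scale_ff) + shift_ff) * gate_ff   # feed-forward branch
print(x.shape)  # torch.Size([2, 16, 64])
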
471
- class Head(nn.Module):
472
-
473
- def __init__(self, dim, out_dim, patch_size, eps=1e-6):
474
- super().__init__()
475
- self.dim = dim
476
- self.out_dim = out_dim
477
- self.patch_size = patch_size
478
- self.eps = eps
479
-
480
- # layers
481
- out_dim = math.prod(patch_size) * out_dim
482
- self.norm = WanLayerNorm(dim, eps)
483
- self.head = nn.Linear(dim, out_dim)
484
-
485
- # modulation
486
- self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
487
-
488
- def forward(self, x, e):
489
- r"""
490
- Args:
491
- x(Tensor): Shape [B, L1, C]
492
- e(Tensor): Shape [B, C]
493
- """
494
- if e.dim() > 2:
495
- e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2)
496
- e = [e.squeeze(2) for e in e]
497
- else:
498
- e = (self.modulation + e.unsqueeze(1)).chunk(2, dim=1)
499
-
500
- x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
501
- return x
502
-
503
-
504
- class MLPProj(torch.nn.Module):
505
-
506
- def __init__(self, in_dim, out_dim):
507
- super().__init__()
508
-
509
- self.proj = torch.nn.Sequential(
510
- torch.nn.LayerNorm(in_dim), torch.nn.Linear(in_dim, in_dim),
511
- torch.nn.GELU(), torch.nn.Linear(in_dim, out_dim),
512
- torch.nn.LayerNorm(out_dim))
513
-
514
- def forward(self, image_embeds):
515
- clip_extra_context_tokens = self.proj(image_embeds)
516
- return clip_extra_context_tokens
517
-
518
-
519
-
520
- class WanTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
521
- r"""
522
- Wan diffusion backbone supporting both text-to-video and image-to-video.
523
- """
524
-
525
- # ignore_for_config = [
526
- # 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
527
- # ]
528
- # _no_split_modules = ['WanAttentionBlock']
529
- _supports_gradient_checkpointing = True
530
-
531
- @register_to_config
532
- def __init__(
533
- self,
534
- model_type='t2v',
535
- patch_size=(1, 2, 2),
536
- text_len=512,
537
- in_dim=16,
538
- dim=2048,
539
- ffn_dim=8192,
540
- freq_dim=256,
541
- text_dim=4096,
542
- out_dim=16,
543
- num_heads=16,
544
- num_layers=32,
545
- window_size=(-1, -1),
546
- qk_norm=True,
547
- cross_attn_norm=True,
548
- eps=1e-6,
549
- in_channels=16,
550
- hidden_size=2048,
551
- add_control_adapter=False,
552
- in_dim_control_adapter=24,
553
- downscale_factor_control_adapter=8,
554
- add_ref_conv=False,
555
- in_dim_ref_conv=16,
556
- cross_attn_type=None,
557
- ):
558
- r"""
559
- Initialize the diffusion model backbone.
560
-
561
- Args:
562
- model_type (`str`, *optional*, defaults to 't2v'):
563
- Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
564
- patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
565
- 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
566
- text_len (`int`, *optional*, defaults to 512):
567
- Fixed length for text embeddings
568
- in_dim (`int`, *optional*, defaults to 16):
569
- Input video channels (C_in)
570
- dim (`int`, *optional*, defaults to 2048):
571
- Hidden dimension of the transformer
572
- ffn_dim (`int`, *optional*, defaults to 8192):
573
- Intermediate dimension in feed-forward network
574
- freq_dim (`int`, *optional*, defaults to 256):
575
- Dimension for sinusoidal time embeddings
576
- text_dim (`int`, *optional*, defaults to 4096):
577
- Input dimension for text embeddings
578
- out_dim (`int`, *optional*, defaults to 16):
579
- Output video channels (C_out)
580
- num_heads (`int`, *optional*, defaults to 16):
581
- Number of attention heads
582
- num_layers (`int`, *optional*, defaults to 32):
583
- Number of transformer blocks
584
- window_size (`tuple`, *optional*, defaults to (-1, -1)):
585
- Window size for local attention (-1 indicates global attention)
586
- qk_norm (`bool`, *optional*, defaults to True):
587
- Enable query/key normalization
588
- cross_attn_norm (`bool`, *optional*, defaults to True):
589
- Enable cross-attention normalization
590
- eps (`float`, *optional*, defaults to 1e-6):
591
- Epsilon value for normalization layers
592
- """
593
-
594
- super().__init__()
595
-
596
- # assert model_type in ['t2v', 'i2v', 'ti2v']
597
- self.model_type = model_type
598
-
599
- self.patch_size = patch_size
600
- self.text_len = text_len
601
- self.in_dim = in_dim
602
- self.dim = dim
603
- self.ffn_dim = ffn_dim
604
- self.freq_dim = freq_dim
605
- self.text_dim = text_dim
606
- self.out_dim = out_dim
607
- self.num_heads = num_heads
608
- self.num_layers = num_layers
609
- self.window_size = window_size
610
- self.qk_norm = qk_norm
611
- self.cross_attn_norm = cross_attn_norm
612
- self.eps = eps
613
-
614
- # embeddings
615
- self.patch_embedding = nn.Conv3d(
616
- in_dim, dim, kernel_size=patch_size, stride=patch_size)
617
- self.text_embedding = nn.Sequential(
618
- nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
619
- nn.Linear(dim, dim))
620
-
621
- self.time_embedding = nn.Sequential(
622
- nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
623
- self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
624
-
625
- # blocks
626
- if cross_attn_type is None:
627
- cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
628
- self.blocks = nn.ModuleList([
629
- WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
630
- window_size, qk_norm, cross_attn_norm, eps)
631
- for _ in range(num_layers)
632
- ])
633
- for layer_idx, block in enumerate(self.blocks):
634
- block.self_attn.layer_idx = layer_idx
635
- block.self_attn.num_layers = self.num_layers
636
-
637
- # head
638
- self.head = Head(dim, out_dim, patch_size, eps)
639
-
640
- # buffers (don't use register_buffer otherwise dtype will be changed in to())
641
- assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
642
- d = dim // num_heads
643
- self.d = d
644
- self.dim = dim
645
- self.freqs = torch.cat(
646
- [
647
- rope_params(1024, d - 4 * (d // 6)),
648
- rope_params(1024, 2 * (d // 6)),
649
- rope_params(1024, 2 * (d // 6))
650
- ],
651
- dim=1
652
- )
653
-
654
- if model_type == 'i2v':
655
- self.img_emb = MLPProj(1280, dim)
656
-
657
- if add_control_adapter:
658
- self.control_adapter = SimpleAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], downscale_factor=downscale_factor_control_adapter)
659
- else:
660
- self.control_adapter = None
661
-
662
- if add_ref_conv:
663
- self.ref_conv = nn.Conv2d(in_dim_ref_conv, dim, kernel_size=patch_size[1:], stride=patch_size[1:])
664
- else:
665
- self.ref_conv = None
666
-
667
- self.teacache = None
668
- self.cfg_skip_ratio = None
669
- self.current_steps = 0
670
- self.num_inference_steps = None
671
- self.gradient_checkpointing = False
672
- self.all_gather = None
673
- self.sp_world_size = 1
674
- self.sp_world_rank = 0
675
- self.init_weights()
676
-
677
- def _set_gradient_checkpointing(self, *args, **kwargs):
678
- if "value" in kwargs:
679
- self.gradient_checkpointing = kwargs["value"]
680
- if hasattr(self, "motioner") and hasattr(self.motioner, "gradient_checkpointing"):
681
- self.motioner.gradient_checkpointing = kwargs["value"]
682
- elif "enable" in kwargs:
683
- self.gradient_checkpointing = kwargs["enable"]
684
- if hasattr(self, "motioner") and hasattr(self.motioner, "gradient_checkpointing"):
685
- self.motioner.gradient_checkpointing = kwargs["enable"]
686
- else:
687
- raise ValueError("Invalid set gradient checkpointing")
688
-
689
- def enable_teacache(
690
- self,
691
- coefficients,
692
- num_steps: int,
693
- rel_l1_thresh: float,
694
- num_skip_start_steps: int = 0,
695
- offload: bool = True,
696
- ):
697
- self.teacache = TeaCache(
698
- coefficients, num_steps, rel_l1_thresh=rel_l1_thresh, num_skip_start_steps=num_skip_start_steps, offload=offload
699
- )
700
-
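
enable_teacache attaches a TeaCache so whole denoising steps can be skipped whenever the accumulated change of the modulated timestep input stays under rel_l1_thresh (see the forward pass below). A usage sketch; the polynomial coefficients are fitted per checkpoint and resolution, so the values here are placeholders, not published numbers:

# `transformer` is a loaded WanTransformer3DModel; coefficients are placeholders.
coefficients = [1.0, 0.0, 0.0, 0.0, 0.0]
transformer.enable_teacache(
    coefficients,
    num_steps=50,            # total sampling steps
    rel_l1_thresh=0.1,       # skip while the accumulated rescaled distance stays below this
    num_skip_start_steps=5,  # never skip the first few steps
)
# A second expert (e.g. the low-noise transformer) can reuse the same cache:
# transformer_2.share_teacache(transformer)
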
701
- def share_teacache(
702
- self,
703
- transformer = None,
704
- ):
705
- self.teacache = transformer.teacache
706
-
707
- def disable_teacache(self):
708
- self.teacache = None
709
-
710
- def enable_cfg_skip(self, cfg_skip_ratio, num_steps):
711
- if cfg_skip_ratio != 0:
712
- self.cfg_skip_ratio = cfg_skip_ratio
713
- self.current_steps = 0
714
- self.num_inference_steps = num_steps
715
- else:
716
- self.cfg_skip_ratio = None
717
- self.current_steps = 0
718
- self.num_inference_steps = None
719
-
720
- def share_cfg_skip(
721
- self,
722
- transformer = None,
723
- ):
724
- self.cfg_skip_ratio = transformer.cfg_skip_ratio
725
- self.current_steps = transformer.current_steps
726
- self.num_inference_steps = transformer.num_inference_steps
727
-
728
- def disable_cfg_skip(self):
729
- self.cfg_skip_ratio = None
730
- self.current_steps = 0
731
- self.num_inference_steps = None
732
-
733
- def enable_riflex(
734
- self,
735
- k = 6,
736
- L_test = 66,
737
- L_test_scale = 4.886,
738
- ):
739
- device = self.freqs.device
740
- self.freqs = torch.cat(
741
- [
742
- get_1d_rotary_pos_embed_riflex(1024, self.d - 4 * (self.d // 6), use_real=False, k=k, L_test=L_test, L_test_scale=L_test_scale),
743
- rope_params(1024, 2 * (self.d // 6)),
744
- rope_params(1024, 2 * (self.d // 6))
745
- ],
746
- dim=1
747
- ).to(device)
748
-
749
- def disable_riflex(self):
750
- device = self.freqs.device
751
- self.freqs = torch.cat(
752
- [
753
- rope_params(1024, self.d - 4 * (self.d // 6)),
754
- rope_params(1024, 2 * (self.d // 6)),
755
- rope_params(1024, 2 * (self.d // 6))
756
- ],
757
- dim=1
758
- ).to(device)
759
-
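
enable_riflex swaps only the temporal rope_params table for the RIFLEx variant, so frame extrapolation beyond the training length stays within one period of the chosen intrinsic frequency; disable_riflex restores the stock table. A usage sketch with illustrative, not prescriptive, values:

# `transformer` is a loaded WanTransformer3DModel instance.
transformer.enable_riflex(k=6, L_test=66)   # e.g. targeting 66 latent frames
# ... run the long-video sampling loop ...
transformer.disable_riflex()                # back to the default rotary table
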
760
- def enable_multi_gpus_inference(self,):
761
- self.sp_world_size = get_sequence_parallel_world_size()
762
- self.sp_world_rank = get_sequence_parallel_rank()
763
- self.all_gather = get_sp_group().all_gather
764
-
765
- # For normal model.
766
- for block in self.blocks:
767
- block.self_attn.forward = types.MethodType(
768
- usp_attn_forward, block.self_attn)
769
-
770
- # For vace model.
771
- if hasattr(self, 'vace_blocks'):
772
- for block in self.vace_blocks:
773
- block.self_attn.forward = types.MethodType(
774
- usp_attn_forward, block.self_attn)
775
-
776
- @cfg_skip()
777
- def forward(
778
- self,
779
- x,
780
- t,
781
- context,
782
- seq_len,
783
- clip_fea=None,
784
- y=None,
785
- y_camera=None,
786
- full_ref=None,
787
- subject_ref=None,
788
- cond_flag=True,
789
- ):
790
- r"""
791
- Forward pass through the diffusion model
792
-
793
- Args:
794
- x (List[Tensor]):
795
- List of input video tensors, each with shape [C_in, F, H, W]
796
- t (Tensor):
797
- Diffusion timesteps tensor of shape [B]
798
- context (List[Tensor]):
799
- List of text embeddings each with shape [L, C]
800
- seq_len (`int`):
801
- Maximum sequence length for positional encoding
802
- clip_fea (Tensor, *optional*):
803
- CLIP image features for image-to-video mode
804
- y (List[Tensor], *optional*):
805
- Conditional video inputs for image-to-video mode, same shape as x
806
- cond_flag (`bool`, *optional*, defaults to True):
807
- Flag to indicate whether to forward the condition input
808
-
809
- Returns:
810
- List[Tensor]:
811
- List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
812
- """
813
- # Wan2.2 doesn't need a CLIP image encoder.
814
- # if self.model_type == 'i2v':
815
- # assert clip_fea is not None and y is not None
816
- # params
817
- device = self.patch_embedding.weight.device
818
- dtype = x.dtype
819
- if self.freqs.device != device and torch.device(type="meta") != device:
820
- self.freqs = self.freqs.to(device)
821
-
822
- if y is not None:
823
- x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
824
-
825
- # embeddings
826
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
827
- # add control adapter
828
- if self.control_adapter is not None and y_camera is not None:
829
- y_camera = self.control_adapter(y_camera)
830
- x = [u + v for u, v in zip(x, y_camera)]
831
-
832
- grid_sizes = torch.stack(
833
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
834
-
835
- x = [u.flatten(2).transpose(1, 2) for u in x]
836
- if self.ref_conv is not None and full_ref is not None:
837
- full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
838
- grid_sizes = torch.stack([torch.tensor([u[0] + 1, u[1], u[2]]) for u in grid_sizes]).to(grid_sizes.device)
839
- seq_len += full_ref.size(1)
840
- x = [torch.concat([_full_ref.unsqueeze(0), u], dim=1) for _full_ref, u in zip(full_ref, x)]
841
- if t.dim() != 1 and t.size(1) < seq_len:
842
- pad_size = seq_len - t.size(1)
843
- last_elements = t[:, -1].unsqueeze(1)
844
- padding = last_elements.repeat(1, pad_size)
845
- t = torch.cat([padding, t], dim=1)
846
-
847
- if subject_ref is not None:
848
- subject_ref_frames = subject_ref.size(2)
849
- subject_ref = self.patch_embedding(subject_ref).flatten(2).transpose(1, 2)
850
- grid_sizes = torch.stack([torch.tensor([u[0] + subject_ref_frames, u[1], u[2]]) for u in grid_sizes]).to(grid_sizes.device)
851
- seq_len += subject_ref.size(1)
852
- x = [torch.concat([u, _subject_ref.unsqueeze(0)], dim=1) for _subject_ref, u in zip(subject_ref, x)]
853
- if t.dim() != 1 and t.size(1) < seq_len:
854
- pad_size = seq_len - t.size(1)
855
- last_elements = t[:, -1].unsqueeze(1)
856
- padding = last_elements.repeat(1, pad_size)
857
- t = torch.cat([t, padding], dim=1)
858
-
859
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
860
- if self.sp_world_size > 1:
861
- seq_len = int(math.ceil(seq_len / self.sp_world_size)) * self.sp_world_size
862
- assert seq_lens.max() <= seq_len
863
- x = torch.cat([
864
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
865
- dim=1) for u in x
866
- ])
867
-
868
- # time embeddings
869
- with amp.autocast(dtype=torch.float32):
870
- if t.dim() != 1:
871
- if t.size(1) < seq_len:
872
- pad_size = seq_len - t.size(1)
873
- last_elements = t[:, -1].unsqueeze(1)
874
- padding = last_elements.repeat(1, pad_size)
875
- t = torch.cat([t, padding], dim=1)
876
- bt = t.size(0)
877
- ft = t.flatten()
878
- e = self.time_embedding(
879
- sinusoidal_embedding_1d(self.freq_dim,
880
- ft).unflatten(0, (bt, seq_len)).float())
881
- e0 = self.time_projection(e).unflatten(2, (6, self.dim))
882
- else:
883
- e = self.time_embedding(
884
- sinusoidal_embedding_1d(self.freq_dim, t).float())
885
- e0 = self.time_projection(e).unflatten(1, (6, self.dim))
886
-
887
- # assert e.dtype == torch.float32 and e0.dtype == torch.float32
888
- # e0 = e0.to(dtype)
889
- # e = e.to(dtype)
890
-
891
- # context
892
- context_lens = None
893
- context = self.text_embedding(
894
- torch.stack([
895
- torch.cat(
896
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
897
- for u in context
898
- ]))
899
-
900
- if clip_fea is not None:
901
- context_clip = self.img_emb(clip_fea) # bs x 257 x dim
902
- context = torch.concat([context_clip, context], dim=1)
903
-
904
- # Context Parallel
905
- if self.sp_world_size > 1:
906
- x = torch.chunk(x, self.sp_world_size, dim=1)[self.sp_world_rank]
907
- if t.dim() != 1:
908
- e0 = torch.chunk(e0, self.sp_world_size, dim=1)[self.sp_world_rank]
909
- e = torch.chunk(e, self.sp_world_size, dim=1)[self.sp_world_rank]
910
-
911
- # TeaCache
912
- if self.teacache is not None:
913
- if cond_flag:
914
- if t.dim() != 1:
915
- modulated_inp = e0[:, -1, :]
916
- else:
917
- modulated_inp = e0
918
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
919
- if skip_flag:
920
- self.should_calc = True
921
- self.teacache.accumulated_rel_l1_distance = 0
922
- else:
923
- if cond_flag:
924
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
925
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
926
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
927
- self.should_calc = False
928
- else:
929
- self.should_calc = True
930
- self.teacache.accumulated_rel_l1_distance = 0
931
- self.teacache.previous_modulated_input = modulated_inp
932
- self.teacache.should_calc = self.should_calc
933
- else:
934
- self.should_calc = self.teacache.should_calc
935
-
936
- # TeaCache
937
- if self.teacache is not None:
938
- if not self.should_calc:
939
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
940
- x = x + previous_residual.to(x.device)[-x.size()[0]:,]
941
- else:
942
- ori_x = x.clone().cpu() if self.teacache.offload else x.clone()
943
-
944
- for block in self.blocks:
945
- if torch.is_grad_enabled() and self.gradient_checkpointing:
946
-
947
- def create_custom_forward(module):
948
- def custom_forward(*inputs):
949
- return module(*inputs)
950
-
951
- return custom_forward
952
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
953
- x = torch.utils.checkpoint.checkpoint(
954
- create_custom_forward(block),
955
- x,
956
- e0,
957
- seq_lens,
958
- grid_sizes,
959
- self.freqs,
960
- context,
961
- context_lens,
962
- dtype,
963
- t,
964
- **ckpt_kwargs,
965
- )
966
- else:
967
- # arguments
968
- kwargs = dict(
969
- e=e0,
970
- seq_lens=seq_lens,
971
- grid_sizes=grid_sizes,
972
- freqs=self.freqs,
973
- context=context,
974
- context_lens=context_lens,
975
- dtype=dtype,
976
- t=t
977
- )
978
- x = block(x, **kwargs)
979
-
980
- if cond_flag:
981
- self.teacache.previous_residual_cond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
982
- else:
983
- self.teacache.previous_residual_uncond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
984
- else:
985
- for block in self.blocks:
986
- if torch.is_grad_enabled() and self.gradient_checkpointing:
987
-
988
- def create_custom_forward(module):
989
- def custom_forward(*inputs):
990
- return module(*inputs)
991
-
992
- return custom_forward
993
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
994
- x = torch.utils.checkpoint.checkpoint(
995
- create_custom_forward(block),
996
- x,
997
- e0,
998
- seq_lens,
999
- grid_sizes,
1000
- self.freqs,
1001
- context,
1002
- context_lens,
1003
- dtype,
1004
- t,
1005
- **ckpt_kwargs,
1006
- )
1007
- else:
1008
- # arguments
1009
- kwargs = dict(
1010
- e=e0,
1011
- seq_lens=seq_lens,
1012
- grid_sizes=grid_sizes,
1013
- freqs=self.freqs,
1014
- context=context,
1015
- context_lens=context_lens,
1016
- dtype=dtype,
1017
- t=t
1018
- )
1019
- x = block(x, **kwargs)
1020
-
1021
- # head
1022
- if torch.is_grad_enabled() and self.gradient_checkpointing:
1023
- def create_custom_forward(module):
1024
- def custom_forward(*inputs):
1025
- return module(*inputs)
1026
-
1027
- return custom_forward
1028
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1029
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.head), x, e, **ckpt_kwargs)
1030
- else:
1031
- x = self.head(x, e)
1032
-
1033
- if self.sp_world_size > 1:
1034
- x = self.all_gather(x, dim=1)
1035
-
1036
- if self.ref_conv is not None and full_ref is not None:
1037
- full_ref_length = full_ref.size(1)
1038
- x = x[:, full_ref_length:]
1039
- grid_sizes = torch.stack([torch.tensor([u[0] - 1, u[1], u[2]]) for u in grid_sizes]).to(grid_sizes.device)
1040
-
1041
- if subject_ref is not None:
1042
- subject_ref_length = subject_ref.size(1)
1043
- x = x[:, :-subject_ref_length]
1044
- grid_sizes = torch.stack([torch.tensor([u[0] - subject_ref_frames, u[1], u[2]]) for u in grid_sizes]).to(grid_sizes.device)
1045
-
1046
- # unpatchify
1047
- x = self.unpatchify(x, grid_sizes)
1048
- x = torch.stack(x)
1049
- if self.teacache is not None and cond_flag:
1050
- self.teacache.cnt += 1
1051
- if self.teacache.cnt == self.teacache.num_steps:
1052
- self.teacache.reset()
1053
- return x
1054
-
1055
-
1056
- def unpatchify(self, x, grid_sizes):
1057
- r"""
1058
- Reconstruct video tensors from patch embeddings.
1059
-
1060
- Args:
1061
- x (List[Tensor]):
1062
- List of patchified features, each with shape [L, C_out * prod(patch_size)]
1063
- grid_sizes (Tensor):
1064
- Original spatial-temporal grid dimensions before patching,
1065
- shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
1066
-
1067
- Returns:
1068
- List[Tensor]:
1069
- Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
1070
- """
1071
-
1072
- c = self.out_dim
1073
- out = []
1074
- for u, v in zip(x, grid_sizes.tolist()):
1075
- u = u[:math.prod(v)].view(*v, *self.patch_size, c)
1076
- u = torch.einsum('fhwpqrc->cfphqwr', u)
1077
- u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
1078
- out.append(u)
1079
- return out
1080
-
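
unpatchify undoes the Conv3d patch embedding on the output side: each token carries prod(patch_size) * C_out values that are folded back onto the (F, H, W) grid. A shape-only sketch of that fold, assuming out_dim=16 and the default (1, 2, 2) patch size:

import math
import torch

patch_size, c_out = (1, 2, 2), 16
f, h, w = 21, 30, 52                        # latent grid sizes after patching
tokens = torch.randn(f * h * w, math.prod(patch_size) * c_out)

u = tokens.view(f, h, w, *patch_size, c_out)
u = torch.einsum('fhwpqrc->cfphqwr', u)
video = u.reshape(c_out, f * patch_size[0], h * patch_size[1], w * patch_size[2])
print(video.shape)  # torch.Size([16, 21, 60, 104])
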
1081
- def init_weights(self):
1082
- r"""
1083
- Initialize model parameters using Xavier initialization.
1084
- """
1085
-
1086
- # basic init
1087
- for m in self.modules():
1088
- if isinstance(m, nn.Linear):
1089
- nn.init.xavier_uniform_(m.weight)
1090
- if m.bias is not None:
1091
- nn.init.zeros_(m.bias)
1092
-
1093
- # init embeddings
1094
- nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
1095
- for m in self.text_embedding.modules():
1096
- if isinstance(m, nn.Linear):
1097
- nn.init.normal_(m.weight, std=.02)
1098
- for m in self.time_embedding.modules():
1099
- if isinstance(m, nn.Linear):
1100
- nn.init.normal_(m.weight, std=.02)
1101
-
1102
- # init output layer
1103
- nn.init.zeros_(self.head.head.weight)
1104
-
1105
- @classmethod
1106
- def from_pretrained(
1107
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
1108
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
1109
- ):
1110
- if subfolder is not None:
1111
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
1112
- print(f"loaded 3D transformer's pretrained weights from {pretrained_model_path} ...")
1113
-
1114
- config_file = os.path.join(pretrained_model_path, 'config.json')
1115
- if not os.path.isfile(config_file):
1116
- raise RuntimeError(f"{config_file} does not exist")
1117
- with open(config_file, "r") as f:
1118
- config = json.load(f)
1119
-
1120
- from diffusers.utils import WEIGHTS_NAME
1121
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
1122
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
1123
-
1124
- if "dict_mapping" in transformer_additional_kwargs.keys():
1125
- for key in transformer_additional_kwargs["dict_mapping"]:
1126
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
1127
-
1128
- if low_cpu_mem_usage:
1129
- try:
1130
- import re
1131
-
1132
- from diffusers import __version__ as diffusers_version
1133
- if diffusers_version >= "0.33.0":
1134
- from diffusers.models.model_loading_utils import \
1135
- load_model_dict_into_meta
1136
- else:
1137
- from diffusers.models.modeling_utils import \
1138
- load_model_dict_into_meta
1139
- from diffusers.utils import is_accelerate_available
1140
- if is_accelerate_available():
1141
- import accelerate
1142
-
1143
- # Instantiate model with empty weights
1144
- with accelerate.init_empty_weights():
1145
- model = cls.from_config(config, **transformer_additional_kwargs)
1146
-
1147
- param_device = "cpu"
1148
- if os.path.exists(model_file):
1149
- state_dict = torch.load(model_file, map_location="cpu")
1150
- elif os.path.exists(model_file_safetensors):
1151
- from safetensors.torch import load_file, safe_open
1152
- state_dict = load_file(model_file_safetensors)
1153
- else:
1154
- from safetensors.torch import load_file, safe_open
1155
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1156
- state_dict = {}
1157
- print(model_files_safetensors)
1158
- for _model_file_safetensors in model_files_safetensors:
1159
- _state_dict = load_file(_model_file_safetensors)
1160
- for key in _state_dict:
1161
- state_dict[key] = _state_dict[key]
1162
-
1163
- if model.state_dict()['patch_embedding.weight'].size() != state_dict['patch_embedding.weight'].size():
1164
- model.state_dict()['patch_embedding.weight'][:, :state_dict['patch_embedding.weight'].size()[1], :, :] = state_dict['patch_embedding.weight'][:, :model.state_dict()['patch_embedding.weight'].size()[1], :, :]
1165
- model.state_dict()['patch_embedding.weight'][:, state_dict['patch_embedding.weight'].size()[1]:, :, :] = 0
1166
- state_dict['patch_embedding.weight'] = model.state_dict()['patch_embedding.weight']
1167
-
1168
- filtered_state_dict = {}
1169
- for key in state_dict:
1170
- if key in model.state_dict() and model.state_dict()[key].size() == state_dict[key].size():
1171
- filtered_state_dict[key] = state_dict[key]
1172
- else:
1173
- print(f"Skipping key '{key}' due to size mismatch or absence in model.")
1174
-
1175
- model_keys = set(model.state_dict().keys())
1176
- loaded_keys = set(filtered_state_dict.keys())
1177
- missing_keys = model_keys - loaded_keys
1178
-
1179
- def initialize_missing_parameters(missing_keys, model_state_dict, torch_dtype=None):
1180
- initialized_dict = {}
1181
-
1182
- with torch.no_grad():
1183
- for key in missing_keys:
1184
- param_shape = model_state_dict[key].shape
1185
- param_dtype = torch_dtype if torch_dtype is not None else model_state_dict[key].dtype
1186
- if 'weight' in key:
1187
- if any(norm_type in key for norm_type in ['norm', 'ln_', 'layer_norm', 'group_norm', 'batch_norm']):
1188
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1189
- elif 'embedding' in key or 'embed' in key:
1190
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1191
- elif 'head' in key or 'output' in key or 'proj_out' in key:
1192
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1193
- elif len(param_shape) >= 2:
1194
- initialized_dict[key] = torch.empty(param_shape, dtype=param_dtype)
1195
- nn.init.xavier_uniform_(initialized_dict[key])
1196
- else:
1197
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
1198
- elif 'bias' in key:
1199
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1200
- elif 'running_mean' in key:
1201
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1202
- elif 'running_var' in key:
1203
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
1204
- elif 'num_batches_tracked' in key:
1205
- initialized_dict[key] = torch.zeros(param_shape, dtype=torch.long)
1206
- else:
1207
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
1208
-
1209
- return initialized_dict
1210
-
1211
- if missing_keys:
1212
- print(f"Missing keys will be initialized: {sorted(missing_keys)}")
1213
- initialized_params = initialize_missing_parameters(
1214
- missing_keys,
1215
- model.state_dict(),
1216
- torch_dtype
1217
- )
1218
- filtered_state_dict.update(initialized_params)
1219
-
1220
- if diffusers_version >= "0.33.0":
1221
- # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
1222
- # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
1223
- load_model_dict_into_meta(
1224
- model,
1225
- filtered_state_dict,
1226
- dtype=torch_dtype,
1227
- model_name_or_path=pretrained_model_path,
1228
- )
1229
- else:
1230
- model._convert_deprecated_attention_blocks(filtered_state_dict)
1231
- unexpected_keys = load_model_dict_into_meta(
1232
- model,
1233
- filtered_state_dict,
1234
- device=param_device,
1235
- dtype=torch_dtype,
1236
- model_name_or_path=pretrained_model_path,
1237
- )
1238
-
1239
- if cls._keys_to_ignore_on_load_unexpected is not None:
1240
- for pat in cls._keys_to_ignore_on_load_unexpected:
1241
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
1242
-
1243
- if len(unexpected_keys) > 0:
1244
- print(
1245
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
1246
- )
1247
-
1248
- return model
1249
- except Exception as e:
1250
- print(
1251
- f"The low_cpu_mem_usage mode is not work because {e}. Use low_cpu_mem_usage=False instead."
1252
- )
1253
-
1254
- model = cls.from_config(config, **transformer_additional_kwargs)
1255
- if os.path.exists(model_file):
1256
- state_dict = torch.load(model_file, map_location="cpu")
1257
- elif os.path.exists(model_file_safetensors):
1258
- from safetensors.torch import load_file, safe_open
1259
- state_dict = load_file(model_file_safetensors)
1260
- else:
1261
- from safetensors.torch import load_file, safe_open
1262
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
1263
- state_dict = {}
1264
- for _model_file_safetensors in model_files_safetensors:
1265
- _state_dict = load_file(_model_file_safetensors)
1266
- for key in _state_dict:
1267
- state_dict[key] = _state_dict[key]
1268
-
1269
- if model.state_dict()['patch_embedding.weight'].size() != state_dict['patch_embedding.weight'].size():
1270
- model.state_dict()['patch_embedding.weight'][:, :state_dict['patch_embedding.weight'].size()[1], :, :] = state_dict['patch_embedding.weight'][:, :model.state_dict()['patch_embedding.weight'].size()[1], :, :]
1271
- model.state_dict()['patch_embedding.weight'][:, state_dict['patch_embedding.weight'].size()[1]:, :, :] = 0
1272
- state_dict['patch_embedding.weight'] = model.state_dict()['patch_embedding.weight']
1273
-
1274
- tmp_state_dict = {}
1275
- for key in state_dict:
1276
- if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
1277
- tmp_state_dict[key] = state_dict[key]
1278
- else:
1279
- print(key, "Size don't match, skip")
1280
-
1281
- state_dict = tmp_state_dict
1282
-
1283
- m, u = model.load_state_dict(state_dict, strict=False)
1284
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1285
- print(m)
1286
-
1287
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
1288
- print(f"### All Parameters: {sum(params) / 1e6} M")
1289
-
1290
- params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
1291
- print(f"### attn1 Parameters: {sum(params) / 1e6} M")
1292
-
1293
- model = model.to(torch_dtype)
1294
- return model
1295
-
1296
-
1297
- class Wan2_2Transformer3DModel(WanTransformer3DModel):
1298
- r"""
1299
- Wan diffusion backbone supporting both text-to-video and image-to-video.
1300
- """
1301
-
1302
- # ignore_for_config = [
1303
- # 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
1304
- # ]
1305
- # _no_split_modules = ['WanAttentionBlock']
1306
- _supports_gradient_checkpointing = True
1307
-
1308
- def __init__(
1309
- self,
1310
- model_type='t2v',
1311
- patch_size=(1, 2, 2),
1312
- text_len=512,
1313
- in_dim=16,
1314
- dim=2048,
1315
- ffn_dim=8192,
1316
- freq_dim=256,
1317
- text_dim=4096,
1318
- out_dim=16,
1319
- num_heads=16,
1320
- num_layers=32,
1321
- window_size=(-1, -1),
1322
- qk_norm=True,
1323
- cross_attn_norm=True,
1324
- eps=1e-6,
1325
- in_channels=16,
1326
- hidden_size=2048,
1327
- add_control_adapter=False,
1328
- in_dim_control_adapter=24,
1329
- downscale_factor_control_adapter=8,
1330
- add_ref_conv=False,
1331
- in_dim_ref_conv=16,
1332
- ):
1333
- r"""
1334
- Initialize the diffusion model backbone.
1335
- Args:
1336
- model_type (`str`, *optional*, defaults to 't2v'):
1337
- Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
1338
- patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
1339
- 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
1340
- text_len (`int`, *optional*, defaults to 512):
1341
- Fixed length for text embeddings
1342
- in_dim (`int`, *optional*, defaults to 16):
1343
- Input video channels (C_in)
1344
- dim (`int`, *optional*, defaults to 2048):
1345
- Hidden dimension of the transformer
1346
- ffn_dim (`int`, *optional*, defaults to 8192):
1347
- Intermediate dimension in feed-forward network
1348
- freq_dim (`int`, *optional*, defaults to 256):
1349
- Dimension for sinusoidal time embeddings
1350
- text_dim (`int`, *optional*, defaults to 4096):
1351
- Input dimension for text embeddings
1352
- out_dim (`int`, *optional*, defaults to 16):
1353
- Output video channels (C_out)
1354
- num_heads (`int`, *optional*, defaults to 16):
1355
- Number of attention heads
1356
- num_layers (`int`, *optional*, defaults to 32):
1357
- Number of transformer blocks
1358
- window_size (`tuple`, *optional*, defaults to (-1, -1)):
1359
- Window size for local attention (-1 indicates global attention)
1360
- qk_norm (`bool`, *optional*, defaults to True):
1361
- Enable query/key normalization
1362
- cross_attn_norm (`bool`, *optional*, defaults to False):
1363
- Enable cross-attention normalization
1364
- eps (`float`, *optional*, defaults to 1e-6):
1365
- Epsilon value for normalization layers
1366
- """
1367
- super().__init__(
1368
- model_type=model_type,
1369
- patch_size=patch_size,
1370
- text_len=text_len,
1371
- in_dim=in_dim,
1372
- dim=dim,
1373
- ffn_dim=ffn_dim,
1374
- freq_dim=freq_dim,
1375
- text_dim=text_dim,
1376
- out_dim=out_dim,
1377
- num_heads=num_heads,
1378
- num_layers=num_layers,
1379
- window_size=window_size,
1380
- qk_norm=qk_norm,
1381
- cross_attn_norm=cross_attn_norm,
1382
- eps=eps,
1383
- in_channels=in_channels,
1384
- hidden_size=hidden_size,
1385
- add_control_adapter=add_control_adapter,
1386
- in_dim_control_adapter=in_dim_control_adapter,
1387
- downscale_factor_control_adapter=downscale_factor_control_adapter,
1388
- add_ref_conv=add_ref_conv,
1389
- in_dim_ref_conv=in_dim_ref_conv,
1390
- cross_attn_type="cross_attn"
1391
- )
1392
-
1393
- if hasattr(self, "img_emb"):
1394
- del self.img_emb
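For reference, a minimal usage sketch of the `from_pretrained` helper removed above (this sketch is not part of the diff; the checkpoint directory and subfolder layout are assumptions, and the import reflects the pre-commit module path):

import torch
from videox_fun.models.wan_transformer3d import Wan2_2Transformer3DModel  # module removed by this commit

transformer = Wan2_2Transformer3DModel.from_pretrained(
    "models/Wan2.2-T2V",         # assumed local checkpoint dir containing config.json and *.safetensors
    subfolder="transformer",     # assumed repository layout
    low_cpu_mem_usage=True,      # meta-device init, weights filled via load_model_dict_into_meta
    torch_dtype=torch.bfloat16,
)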
videox_fun/models/wan_transformer3d_animate.py DELETED
@@ -1,302 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import math
3
- import types
4
- from copy import deepcopy
5
- from typing import Any, Dict, List
6
-
7
- import numpy as np
8
- import torch
9
- import torch.cuda.amp as amp
10
- import torch.nn as nn
11
- from diffusers.configuration_utils import ConfigMixin, register_to_config
12
- from diffusers.loaders import PeftAdapterMixin
13
- from diffusers.models.modeling_utils import ModelMixin
14
- from diffusers.utils import is_torch_version, logging
15
- from einops import rearrange
16
-
17
- from .attention_utils import attention
18
- from .wan_animate_adapter import FaceAdapter, FaceEncoder
19
- from .wan_animate_motion_encoder import Generator
20
- from .wan_transformer3d import (Head, MLPProj, WanAttentionBlock, WanLayerNorm,
21
- WanRMSNorm, WanSelfAttention,
22
- WanTransformer3DModel, rope_apply,
23
- sinusoidal_embedding_1d)
24
- from ..utils import cfg_skip
25
-
26
-
27
- class Wan2_2Transformer3DModel_Animate(WanTransformer3DModel):
28
- # _no_split_modules = ['WanAnimateAttentionBlock']
29
- _supports_gradient_checkpointing = True
30
-
31
- @register_to_config
32
- def __init__(
33
- self,
34
- patch_size=(1, 2, 2),
35
- text_len=512,
36
- in_dim=36,
37
- dim=5120,
38
- ffn_dim=13824,
39
- freq_dim=256,
40
- text_dim=4096,
41
- out_dim=16,
42
- num_heads=40,
43
- num_layers=40,
44
- window_size=(-1, -1),
45
- qk_norm=True,
46
- cross_attn_norm=True,
47
- eps=1e-6,
48
- motion_encoder_dim=512,
49
- use_img_emb=True
50
- ):
51
- model_type = "i2v" # TODO: Hard code for both preview and official versions.
52
- super().__init__(model_type, patch_size, text_len, in_dim, dim, ffn_dim, freq_dim, text_dim, out_dim,
53
- num_heads, num_layers, window_size, qk_norm, cross_attn_norm, eps)
54
-
55
- self.motion_encoder_dim = motion_encoder_dim
56
- self.use_img_emb = use_img_emb
57
-
58
- self.pose_patch_embedding = nn.Conv3d(
59
- 16, dim, kernel_size=patch_size, stride=patch_size
60
- )
61
-
62
- # initialize weights
63
- self.init_weights()
64
-
65
- self.motion_encoder = Generator(size=512, style_dim=512, motion_dim=20)
66
- self.face_adapter = FaceAdapter(
67
- heads_num=self.num_heads,
68
- hidden_dim=self.dim,
69
- num_adapter_layers=self.num_layers // 5,
70
- )
71
-
72
- self.face_encoder = FaceEncoder(
73
- in_dim=motion_encoder_dim,
74
- hidden_dim=self.dim,
75
- num_heads=4,
76
- )
77
-
78
- def after_patch_embedding(self, x: List[torch.Tensor], pose_latents, face_pixel_values):
79
- pose_latents = [self.pose_patch_embedding(u.unsqueeze(0)) for u in pose_latents]
80
- for x_, pose_latents_ in zip(x, pose_latents):
81
- x_[:, :, 1:] += pose_latents_
82
-
83
- b,c,T,h,w = face_pixel_values.shape
84
- face_pixel_values = rearrange(face_pixel_values, "b c t h w -> (b t) c h w")
85
-
86
- encode_bs = 8
87
- face_pixel_values_tmp = []
88
- for i in range(math.ceil(face_pixel_values.shape[0]/encode_bs)):
89
- face_pixel_values_tmp.append(self.motion_encoder.get_motion(face_pixel_values[i*encode_bs:(i+1)*encode_bs]))
90
-
91
- motion_vec = torch.cat(face_pixel_values_tmp)
92
-
93
- motion_vec = rearrange(motion_vec, "(b t) c -> b t c", t=T)
94
- motion_vec = self.face_encoder(motion_vec)
95
-
96
- B, L, H, C = motion_vec.shape
97
- pad_face = torch.zeros(B, 1, H, C).type_as(motion_vec)
98
- motion_vec = torch.cat([pad_face, motion_vec], dim=1)
99
- return x, motion_vec
100
-
101
- def after_transformer_block(self, block_idx, x, motion_vec, motion_masks=None):
102
- if block_idx % 5 == 0:
103
- use_context_parallel = self.sp_world_size > 1
104
- adapter_args = [x, motion_vec, motion_masks, use_context_parallel, self.all_gather, self.sp_world_size, self.sp_world_rank]
105
- residual_out = self.face_adapter.fuser_blocks[block_idx // 5](*adapter_args)
106
- x = residual_out + x
107
- return x
108
-
109
- @cfg_skip()
110
- def forward(
111
- self,
112
- x,
113
- t,
114
- clip_fea,
115
- context,
116
- seq_len,
117
- y=None,
118
- pose_latents=None,
119
- face_pixel_values=None,
120
- cond_flag=True
121
- ):
122
- # params
123
- device = self.patch_embedding.weight.device
124
- dtype = x.dtype
125
- if self.freqs.device != device and torch.device(type="meta") != device:
126
- self.freqs = self.freqs.to(device)
127
-
128
- if y is not None:
129
- x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
130
-
131
- # embeddings
132
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
133
- x, motion_vec = self.after_patch_embedding(x, pose_latents, face_pixel_values)
134
-
135
- grid_sizes = torch.stack(
136
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
137
- x = [u.flatten(2).transpose(1, 2) for u in x]
138
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
139
- if self.sp_world_size > 1:
140
- seq_len = int(math.ceil(seq_len / self.sp_world_size)) * self.sp_world_size
141
- assert seq_lens.max() <= seq_len
142
- x = torch.cat([
143
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
144
- dim=1) for u in x
145
- ])
146
-
147
- # time embeddings
148
- with amp.autocast(dtype=torch.float32):
149
- e = self.time_embedding(
150
- sinusoidal_embedding_1d(self.freq_dim, t).float()
151
- )
152
- e0 = self.time_projection(e).unflatten(1, (6, self.dim))
153
- assert e.dtype == torch.float32 and e0.dtype == torch.float32
154
-
155
- # context
156
- context_lens = None
157
- context = self.text_embedding(
158
- torch.stack([
159
- torch.cat(
160
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
161
- for u in context
162
- ]))
163
-
164
- if self.use_img_emb:
165
- context_clip = self.img_emb(clip_fea) # bs x 257 x dim
166
- context = torch.concat([context_clip, context], dim=1)
167
-
168
- # Context Parallel
169
- if self.sp_world_size > 1:
170
- x = torch.chunk(x, self.sp_world_size, dim=1)[self.sp_world_rank]
171
- if t.dim() != 1:
172
- e0 = torch.chunk(e0, self.sp_world_size, dim=1)[self.sp_world_rank]
173
- e = torch.chunk(e, self.sp_world_size, dim=1)[self.sp_world_rank]
174
-
175
- # TeaCache
176
- if self.teacache is not None:
177
- if cond_flag:
178
- if t.dim() != 1:
179
- modulated_inp = e0[0][:, -1, :]
180
- else:
181
- modulated_inp = e0[0]
182
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
183
- if skip_flag:
184
- self.should_calc = True
185
- self.teacache.accumulated_rel_l1_distance = 0
186
- else:
187
- if cond_flag:
188
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
189
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
190
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
191
- self.should_calc = False
192
- else:
193
- self.should_calc = True
194
- self.teacache.accumulated_rel_l1_distance = 0
195
- self.teacache.previous_modulated_input = modulated_inp
196
- self.teacache.should_calc = self.should_calc
197
- else:
198
- self.should_calc = self.teacache.should_calc
199
-
200
- # TeaCache
201
- if self.teacache is not None:
202
- if not self.should_calc:
203
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
204
- x = x + previous_residual.to(x.device)[-x.size()[0]:,]
205
- else:
206
- ori_x = x.clone().cpu() if self.teacache.offload else x.clone()
207
- for idx, block in enumerate(self.blocks):
208
- if torch.is_grad_enabled() and self.gradient_checkpointing:
209
-
210
- def create_custom_forward(module):
211
- def custom_forward(*inputs):
212
- return module(*inputs)
213
-
214
- return custom_forward
215
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
216
- x = torch.utils.checkpoint.checkpoint(
217
- create_custom_forward(block),
218
- x,
219
- e0,
220
- seq_lens,
221
- grid_sizes,
222
- self.freqs,
223
- context,
224
- context_lens,
225
- dtype,
226
- t,
227
- **ckpt_kwargs,
228
- )
229
- x, motion_vec = x.to(dtype), motion_vec.to(dtype)
230
- x = self.after_transformer_block(idx, x, motion_vec)
231
- else:
232
- # arguments
233
- kwargs = dict(
234
- e=e0,
235
- seq_lens=seq_lens,
236
- grid_sizes=grid_sizes,
237
- freqs=self.freqs,
238
- context=context,
239
- context_lens=context_lens,
240
- dtype=dtype,
241
- t=t
242
- )
243
- x = block(x, **kwargs)
244
- x, motion_vec = x.to(dtype), motion_vec.to(dtype)
245
- x = self.after_transformer_block(idx, x, motion_vec)
246
-
247
- if cond_flag:
248
- self.teacache.previous_residual_cond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
249
- else:
250
- self.teacache.previous_residual_uncond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
251
- else:
252
- for idx, block in enumerate(self.blocks):
253
- if torch.is_grad_enabled() and self.gradient_checkpointing:
254
-
255
- def create_custom_forward(module):
256
- def custom_forward(*inputs):
257
- return module(*inputs)
258
-
259
- return custom_forward
260
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
261
- x = torch.utils.checkpoint.checkpoint(
262
- create_custom_forward(block),
263
- x,
264
- e0,
265
- seq_lens,
266
- grid_sizes,
267
- self.freqs,
268
- context,
269
- context_lens,
270
- dtype,
271
- t,
272
- **ckpt_kwargs,
273
- )
274
- x, motion_vec = x.to(dtype), motion_vec.to(dtype)
275
- x = self.after_transformer_block(idx, x, motion_vec)
276
- else:
277
- # arguments
278
- kwargs = dict(
279
- e=e0,
280
- seq_lens=seq_lens,
281
- grid_sizes=grid_sizes,
282
- freqs=self.freqs,
283
- context=context,
284
- context_lens=context_lens,
285
- dtype=dtype,
286
- t=t
287
- )
288
- x = block(x, **kwargs)
289
- x, motion_vec = x.to(dtype), motion_vec.to(dtype)
290
- x = self.after_transformer_block(idx, x, motion_vec)
291
-
292
- # head
293
- x = self.head(x, e)
294
-
295
- # Context Parallel
296
- if self.sp_world_size > 1:
297
- x = self.all_gather(x.contiguous(), dim=1)
298
-
299
- # unpatchify
300
- x = self.unpatchify(x, grid_sizes)
301
- x = torch.stack(x)
302
- return x
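The forward pass above interleaves the face adapter with the main transformer: every fifth block, `after_transformer_block` adds a residual computed from the motion vector. A minimal self-contained sketch of that pattern (illustrative only; the module below is not from this repository):

import torch
import torch.nn as nn

class AdapterEveryNBlocks(nn.Module):
    """Run the main blocks and fuse an adapter residual every `stride` blocks,
    mirroring after_transformer_block(block_idx % 5 == 0) above."""
    def __init__(self, dim=64, num_blocks=10, stride=5):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_blocks))
        self.adapters = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_blocks // stride))
        self.stride = stride

    def forward(self, x, motion_vec):
        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx % self.stride == 0:
                # residual injection of the motion signal
                x = x + self.adapters[idx // self.stride](x + motion_vec)
        return x

demo = AdapterEveryNBlocks()
out = demo(torch.randn(2, 16, 64), torch.randn(2, 16, 64))  # [batch, tokens, dim]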
videox_fun/models/wan_transformer3d_s2v.py DELETED
@@ -1,932 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/s2v/model_s2v.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
-
4
- import math
5
- import types
6
- from copy import deepcopy
7
- from typing import Any, Dict
8
-
9
- import torch
10
- import torch.cuda.amp as amp
11
- import torch.nn as nn
12
- from diffusers.configuration_utils import register_to_config
13
- from diffusers.utils import is_torch_version
14
- from einops import rearrange
15
-
16
- from ..dist import (get_sequence_parallel_rank,
17
- get_sequence_parallel_world_size, get_sp_group,
18
- usp_attn_s2v_forward)
19
- from .attention_utils import attention
20
- from .wan_audio_injector import (AudioInjector_WAN, CausalAudioEncoder,
21
- FramePackMotioner, MotionerTransformers,
22
- rope_precompute)
23
- from .wan_transformer3d import (Wan2_2Transformer3DModel, WanAttentionBlock,
24
- WanLayerNorm, WanSelfAttention,
25
- sinusoidal_embedding_1d)
26
- from ..utils import cfg_skip
27
-
28
-
29
- def zero_module(module):
30
- """
31
- Zero out the parameters of a module and return it.
32
- """
33
- for p in module.parameters():
34
- p.detach().zero_()
35
- return module
36
-
37
-
38
- def torch_dfs(model: nn.Module, parent_name='root'):
39
- module_names, modules = [], []
40
- current_name = parent_name if parent_name else 'root'
41
- module_names.append(current_name)
42
- modules.append(model)
43
-
44
- for name, child in model.named_children():
45
- if parent_name:
46
- child_name = f'{parent_name}.{name}'
47
- else:
48
- child_name = name
49
- child_modules, child_names = torch_dfs(child, child_name)
50
- module_names += child_names
51
- modules += child_modules
52
- return modules, module_names
53
-
54
-
55
- @amp.autocast(enabled=False)
56
- @torch.compiler.disable()
57
- def s2v_rope_apply(x, grid_sizes, freqs, start=None):
58
- n, c = x.size(2), x.size(3) // 2
59
- # loop over samples
60
- output = []
61
- for i, _ in enumerate(x):
62
- s = x.size(1)
63
- x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(s, n, -1, 2))
64
- freqs_i = freqs[i, :s]
65
- # apply rotary embedding
66
- x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
67
- x_i = torch.cat([x_i, x[i, s:]])
68
- # append to collection
69
- output.append(x_i)
70
- return torch.stack(output).float()
71
-
72
-
73
- def s2v_rope_apply_qk(q, k, grid_sizes, freqs):
74
- q = s2v_rope_apply(q, grid_sizes, freqs)
75
- k = s2v_rope_apply(k, grid_sizes, freqs)
76
- return q, k
77
-
78
-
79
- class WanS2VSelfAttention(WanSelfAttention):
80
-
81
- def forward(self, x, seq_lens, grid_sizes, freqs, dtype=torch.bfloat16, t=0):
82
- """
83
- Args:
84
- x(Tensor): Shape [B, L, num_heads, C / num_heads]
85
- seq_lens(Tensor): Shape [B]
86
- grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
87
- freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
88
- """
89
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
90
-
91
- # query, key, value function
92
- def qkv_fn(x):
93
- q = self.norm_q(self.q(x)).view(b, s, n, d)
94
- k = self.norm_k(self.k(x)).view(b, s, n, d)
95
- v = self.v(x).view(b, s, n, d)
96
- return q, k, v
97
-
98
- q, k, v = qkv_fn(x)
99
-
100
- q, k = s2v_rope_apply_qk(q, k, grid_sizes, freqs)
101
-
102
- x = attention(
103
- q.to(dtype),
104
- k.to(dtype),
105
- v=v.to(dtype),
106
- k_lens=seq_lens,
107
- window_size=self.window_size)
108
- x = x.to(dtype)
109
-
110
- # output
111
- x = x.flatten(2)
112
- x = self.o(x)
113
- return x
114
-
115
-
116
- class WanS2VAttentionBlock(WanAttentionBlock):
117
-
118
- def __init__(self,
119
- cross_attn_type,
120
- dim,
121
- ffn_dim,
122
- num_heads,
123
- window_size=(-1, -1),
124
- qk_norm=True,
125
- cross_attn_norm=False,
126
- eps=1e-6):
127
- super().__init__(
128
- cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps
129
- )
130
- self.self_attn = WanS2VSelfAttention(dim, num_heads, window_size,qk_norm, eps)
131
-
132
- def forward(self, x, e, seq_lens, grid_sizes, freqs, context, context_lens, dtype=torch.bfloat16, t=0):
133
- # e
134
- seg_idx = e[1].item()
135
- seg_idx = min(max(0, seg_idx), x.size(1))
136
- seg_idx = [0, seg_idx, x.size(1)]
137
- e = e[0]
138
- modulation = self.modulation.unsqueeze(2)
139
- e = (modulation + e).chunk(6, dim=1)
140
- e = [element.squeeze(1) for element in e]
141
-
142
- # norm
143
- norm_x = self.norm1(x).float()
144
- parts = []
145
- for i in range(2):
146
- parts.append(norm_x[:, seg_idx[i]:seg_idx[i + 1]] *
147
- (1 + e[1][:, i:i + 1]) + e[0][:, i:i + 1])
148
- norm_x = torch.cat(parts, dim=1)
149
- # self-attention
150
- y = self.self_attn(norm_x, seq_lens, grid_sizes, freqs)
151
- with amp.autocast(dtype=torch.float32):
152
- z = []
153
- for i in range(2):
154
- z.append(y[:, seg_idx[i]:seg_idx[i + 1]] * e[2][:, i:i + 1])
155
- y = torch.cat(z, dim=1)
156
- x = x + y
157
-
158
- # cross-attention & ffn function
159
- def cross_attn_ffn(x, context, context_lens, e):
160
- x = x + self.cross_attn(self.norm3(x), context, context_lens)
161
- norm2_x = self.norm2(x).float()
162
- parts = []
163
- for i in range(2):
164
- parts.append(norm2_x[:, seg_idx[i]:seg_idx[i + 1]] *
165
- (1 + e[4][:, i:i + 1]) + e[3][:, i:i + 1])
166
- norm2_x = torch.cat(parts, dim=1)
167
- y = self.ffn(norm2_x)
168
- with amp.autocast(dtype=torch.float32):
169
- z = []
170
- for i in range(2):
171
- z.append(y[:, seg_idx[i]:seg_idx[i + 1]] * e[5][:, i:i + 1])
172
- y = torch.cat(z, dim=1)
173
- x = x + y
174
- return x
175
-
176
- x = cross_attn_ffn(x, context, context_lens, e)
177
- return x
178
-
179
-
180
- class Wan2_2Transformer3DModel_S2V(Wan2_2Transformer3DModel):
181
- # ignore_for_config = [
182
- # 'args', 'kwargs', 'patch_size', 'cross_attn_norm', 'qk_norm',
183
- # 'text_dim', 'window_size'
184
- # ]
185
- # _no_split_modules = ['WanS2VAttentionBlock']
186
-
187
- @register_to_config
188
- def __init__(
189
- self,
190
- cond_dim=0,
191
- audio_dim=5120,
192
- num_audio_token=4,
193
- enable_adain=False,
194
- adain_mode="attn_norm",
195
- audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27],
196
- zero_init=False,
197
- zero_timestep=False,
198
- enable_motioner=True,
199
- add_last_motion=True,
200
- enable_tsm=False,
201
- trainable_token_pos_emb=False,
202
- motion_token_num=1024,
203
- enable_framepack=False, # Mutually exclusive with enable_motioner
204
- framepack_drop_mode="drop",
205
- model_type='s2v',
206
- patch_size=(1, 2, 2),
207
- text_len=512,
208
- in_dim=16,
209
- dim=2048,
210
- ffn_dim=8192,
211
- freq_dim=256,
212
- text_dim=4096,
213
- out_dim=16,
214
- num_heads=16,
215
- num_layers=32,
216
- window_size=(-1, -1),
217
- qk_norm=True,
218
- cross_attn_norm=True,
219
- eps=1e-6,
220
- in_channels=16,
221
- hidden_size=2048,
222
- *args,
223
- **kwargs
224
- ):
225
- super().__init__(
226
- model_type=model_type,
227
- patch_size=patch_size,
228
- text_len=text_len,
229
- in_dim=in_dim,
230
- dim=dim,
231
- ffn_dim=ffn_dim,
232
- freq_dim=freq_dim,
233
- text_dim=text_dim,
234
- out_dim=out_dim,
235
- num_heads=num_heads,
236
- num_layers=num_layers,
237
- window_size=window_size,
238
- qk_norm=qk_norm,
239
- cross_attn_norm=cross_attn_norm,
240
- eps=eps,
241
- in_channels=in_channels,
242
- hidden_size=hidden_size
243
- )
244
-
245
- assert model_type == 's2v'
246
- self.enbale_adain = enable_adain
247
- # Whether to assign 0 value timestep to ref/motion
248
- self.adain_mode = adain_mode
249
- self.zero_timestep = zero_timestep
250
- self.enable_motioner = enable_motioner
251
- self.add_last_motion = add_last_motion
252
- self.enable_framepack = enable_framepack
253
-
254
- # Replace blocks
255
- self.blocks = nn.ModuleList([
256
- WanS2VAttentionBlock("cross_attn", dim, ffn_dim, num_heads, window_size, qk_norm,
257
- cross_attn_norm, eps)
258
- for _ in range(num_layers)
259
- ])
260
-
261
- # init audio injector
262
- all_modules, all_modules_names = torch_dfs(self.blocks, parent_name="root.transformer_blocks")
263
- if cond_dim > 0:
264
- self.cond_encoder = nn.Conv3d(
265
- cond_dim,
266
- self.dim,
267
- kernel_size=self.patch_size,
268
- stride=self.patch_size)
269
- self.trainable_cond_mask = nn.Embedding(3, self.dim)
270
- self.casual_audio_encoder = CausalAudioEncoder(
271
- dim=audio_dim,
272
- out_dim=self.dim,
273
- num_token=num_audio_token,
274
- need_global=enable_adain)
275
- self.audio_injector = AudioInjector_WAN(
276
- all_modules,
277
- all_modules_names,
278
- dim=self.dim,
279
- num_heads=self.num_heads,
280
- inject_layer=audio_inject_layers,
281
- root_net=self,
282
- enable_adain=enable_adain,
283
- adain_dim=self.dim,
284
- need_adain_ont=adain_mode != "attn_norm",
285
- )
286
-
287
- if zero_init:
288
- self.zero_init_weights()
289
-
290
- # init motioner
291
- if enable_motioner and enable_framepack:
292
- raise ValueError(
293
- "enable_motioner and enable_framepack are mutually exclusive, please set one of them to False"
294
- )
295
- if enable_motioner:
296
- motioner_dim = 2048
297
- self.motioner = MotionerTransformers(
298
- patch_size=(2, 4, 4),
299
- dim=motioner_dim,
300
- ffn_dim=motioner_dim,
301
- freq_dim=256,
302
- out_dim=16,
303
- num_heads=16,
304
- num_layers=13,
305
- window_size=(-1, -1),
306
- qk_norm=True,
307
- cross_attn_norm=False,
308
- eps=1e-6,
309
- motion_token_num=motion_token_num,
310
- enable_tsm=enable_tsm,
311
- motion_stride=4,
312
- expand_ratio=2,
313
- trainable_token_pos_emb=trainable_token_pos_emb,
314
- )
315
- self.zip_motion_out = torch.nn.Sequential(
316
- WanLayerNorm(motioner_dim),
317
- zero_module(nn.Linear(motioner_dim, self.dim)))
318
-
319
- self.trainable_token_pos_emb = trainable_token_pos_emb
320
- if trainable_token_pos_emb:
321
- d = self.dim // self.num_heads
322
- x = torch.zeros([1, motion_token_num, self.num_heads, d])
323
- x[..., ::2] = 1
324
-
325
- gride_sizes = [[
326
- torch.tensor([0, 0, 0]).unsqueeze(0).repeat(1, 1),
327
- torch.tensor([
328
- 1, self.motioner.motion_side_len,
329
- self.motioner.motion_side_len
330
- ]).unsqueeze(0).repeat(1, 1),
331
- torch.tensor([
332
- 1, self.motioner.motion_side_len,
333
- self.motioner.motion_side_len
334
- ]).unsqueeze(0).repeat(1, 1),
335
- ]]
336
- token_freqs = s2v_rope_apply(x, gride_sizes, self.freqs)
337
- token_freqs = token_freqs[0, :,
338
- 0].reshape(motion_token_num, -1, 2)
339
- token_freqs = token_freqs * 0.01
340
- self.token_freqs = torch.nn.Parameter(token_freqs)
341
-
342
- if enable_framepack:
343
- self.frame_packer = FramePackMotioner(
344
- inner_dim=self.dim,
345
- num_heads=self.num_heads,
346
- zip_frame_buckets=[1, 2, 16],
347
- drop_mode=framepack_drop_mode)
348
-
349
- def enable_multi_gpus_inference(self,):
350
- self.sp_world_size = get_sequence_parallel_world_size()
351
- self.sp_world_rank = get_sequence_parallel_rank()
352
- self.all_gather = get_sp_group().all_gather
353
- for block in self.blocks:
354
- block.self_attn.forward = types.MethodType(
355
- usp_attn_s2v_forward, block.self_attn)
356
-
357
- def process_motion(self, motion_latents, drop_motion_frames=False):
358
- if drop_motion_frames or motion_latents[0].shape[1] == 0:
359
- return [], []
360
- self.lat_motion_frames = motion_latents[0].shape[1]
361
- mot = [self.patch_embedding(m.unsqueeze(0)) for m in motion_latents]
362
- batch_size = len(mot)
363
-
364
- mot_remb = []
365
- flattern_mot = []
366
- for bs in range(batch_size):
367
- height, width = mot[bs].shape[3], mot[bs].shape[4]
368
- flat_mot = mot[bs].flatten(2).transpose(1, 2).contiguous()
369
- motion_grid_sizes = [[
370
- torch.tensor([-self.lat_motion_frames, 0,
371
- 0]).unsqueeze(0).repeat(1, 1),
372
- torch.tensor([0, height, width]).unsqueeze(0).repeat(1, 1),
373
- torch.tensor([self.lat_motion_frames, height,
374
- width]).unsqueeze(0).repeat(1, 1)
375
- ]]
376
- motion_rope_emb = rope_precompute(
377
- flat_mot.detach().view(1, flat_mot.shape[1], self.num_heads,
378
- self.dim // self.num_heads),
379
- motion_grid_sizes,
380
- self.freqs,
381
- start=None)
382
- mot_remb.append(motion_rope_emb)
383
- flattern_mot.append(flat_mot)
384
- return flattern_mot, mot_remb
385
-
386
- def process_motion_frame_pack(self,
387
- motion_latents,
388
- drop_motion_frames=False,
389
- add_last_motion=2):
390
- flattern_mot, mot_remb = self.frame_packer(motion_latents,
391
- add_last_motion)
392
- if drop_motion_frames:
393
- return [m[:, :0] for m in flattern_mot
394
- ], [m[:, :0] for m in mot_remb]
395
- else:
396
- return flattern_mot, mot_remb
397
-
398
- def process_motion_transformer_motioner(self,
399
- motion_latents,
400
- drop_motion_frames=False,
401
- add_last_motion=True):
402
- batch_size, height, width = len(
403
- motion_latents), motion_latents[0].shape[2] // self.patch_size[
404
- 1], motion_latents[0].shape[3] // self.patch_size[2]
405
-
406
- freqs = self.freqs
407
- device = self.patch_embedding.weight.device
408
- if freqs.device != device:
409
- freqs = freqs.to(device)
410
- if self.trainable_token_pos_emb:
411
- with amp.autocast(dtype=torch.float64):
412
- token_freqs = self.token_freqs.to(torch.float64)
413
- token_freqs = token_freqs / token_freqs.norm(
414
- dim=-1, keepdim=True)
415
- freqs = [freqs, torch.view_as_complex(token_freqs)]
416
-
417
- if not drop_motion_frames and add_last_motion:
418
- last_motion_latent = [u[:, -1:] for u in motion_latents]
419
- last_mot = [
420
- self.patch_embedding(m.unsqueeze(0)) for m in last_motion_latent
421
- ]
422
- last_mot = [m.flatten(2).transpose(1, 2) for m in last_mot]
423
- last_mot = torch.cat(last_mot)
424
- gride_sizes = [[
425
- torch.tensor([-1, 0, 0]).unsqueeze(0).repeat(batch_size, 1),
426
- torch.tensor([0, height,
427
- width]).unsqueeze(0).repeat(batch_size, 1),
428
- torch.tensor([1, height,
429
- width]).unsqueeze(0).repeat(batch_size, 1)
430
- ]]
431
- else:
432
- last_mot = torch.zeros([batch_size, 0, self.dim],
433
- device=motion_latents[0].device,
434
- dtype=motion_latents[0].dtype)
435
- gride_sizes = []
436
-
437
- zip_motion = self.motioner(motion_latents)
438
- zip_motion = self.zip_motion_out(zip_motion)
439
- if drop_motion_frames:
440
- zip_motion = zip_motion * 0.0
441
- zip_motion_grid_sizes = [[
442
- torch.tensor([-1, 0, 0]).unsqueeze(0).repeat(batch_size, 1),
443
- torch.tensor([
444
- 0, self.motioner.motion_side_len, self.motioner.motion_side_len
445
- ]).unsqueeze(0).repeat(batch_size, 1),
446
- torch.tensor(
447
- [1 if not self.trainable_token_pos_emb else -1, height,
448
- width]).unsqueeze(0).repeat(batch_size, 1),
449
- ]]
450
-
451
- mot = torch.cat([last_mot, zip_motion], dim=1)
452
- gride_sizes = gride_sizes + zip_motion_grid_sizes
453
-
454
- motion_rope_emb = rope_precompute(
455
- mot.detach().view(batch_size, mot.shape[1], self.num_heads,
456
- self.dim // self.num_heads),
457
- gride_sizes,
458
- freqs,
459
- start=None)
460
- return [m.unsqueeze(0) for m in mot
461
- ], [r.unsqueeze(0) for r in motion_rope_emb]
462
-
463
- def inject_motion(self,
464
- x,
465
- seq_lens,
466
- rope_embs,
467
- mask_input,
468
- motion_latents,
469
- drop_motion_frames=False,
470
- add_last_motion=True):
471
- # Inject the motion frames token to the hidden states
472
- if self.enable_motioner:
473
- mot, mot_remb = self.process_motion_transformer_motioner(
474
- motion_latents,
475
- drop_motion_frames=drop_motion_frames,
476
- add_last_motion=add_last_motion)
477
- elif self.enable_framepack:
478
- mot, mot_remb = self.process_motion_frame_pack(
479
- motion_latents,
480
- drop_motion_frames=drop_motion_frames,
481
- add_last_motion=add_last_motion)
482
- else:
483
- mot, mot_remb = self.process_motion(
484
- motion_latents, drop_motion_frames=drop_motion_frames)
485
-
486
- if len(mot) > 0:
487
- x = [torch.cat([u, m], dim=1) for u, m in zip(x, mot)]
488
- seq_lens = seq_lens + torch.tensor([r.size(1) for r in mot],
489
- dtype=torch.long)
490
- rope_embs = [
491
- torch.cat([u, m], dim=1) for u, m in zip(rope_embs, mot_remb)
492
- ]
493
- mask_input = [
494
- torch.cat([
495
- m, 2 * torch.ones([1, u.shape[1] - m.shape[1]],
496
- device=m.device,
497
- dtype=m.dtype)
498
- ],
499
- dim=1) for m, u in zip(mask_input, x)
500
- ]
501
- return x, seq_lens, rope_embs, mask_input
502
-
503
- def after_transformer_block(self, block_idx, hidden_states):
504
- if block_idx in self.audio_injector.injected_block_id.keys():
505
- audio_attn_id = self.audio_injector.injected_block_id[block_idx]
506
- audio_emb = self.merged_audio_emb # b f n c
507
- num_frames = audio_emb.shape[1]
508
-
509
- if self.sp_world_size > 1:
510
- hidden_states = self.all_gather(hidden_states, dim=1)
511
-
512
- input_hidden_states = hidden_states[:, :self.original_seq_len].clone()
513
- input_hidden_states = rearrange(
514
- input_hidden_states, "b (t n) c -> (b t) n c", t=num_frames)
515
-
516
- if self.enbale_adain and self.adain_mode == "attn_norm":
517
- audio_emb_global = self.audio_emb_global
518
- audio_emb_global = rearrange(audio_emb_global,
519
- "b t n c -> (b t) n c")
520
- adain_hidden_states = self.audio_injector.injector_adain_layers[audio_attn_id](
521
- input_hidden_states, temb=audio_emb_global[:, 0]
522
- )
523
- attn_hidden_states = adain_hidden_states
524
- else:
525
- attn_hidden_states = self.audio_injector.injector_pre_norm_feat[audio_attn_id](
526
- input_hidden_states
527
- )
528
- audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
529
- attn_audio_emb = audio_emb
530
- context_lens = torch.ones(
531
- attn_hidden_states.shape[0], dtype=torch.long, device=attn_hidden_states.device
532
- ) * attn_audio_emb.shape[1]
533
-
534
- if torch.is_grad_enabled() and self.gradient_checkpointing:
535
- def create_custom_forward(module):
536
- def custom_forward(*inputs):
537
- return module(*inputs)
538
-
539
- return custom_forward
540
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
541
- residual_out = torch.utils.checkpoint.checkpoint(
542
- create_custom_forward(self.audio_injector.injector[audio_attn_id]),
543
- attn_hidden_states,
544
- attn_audio_emb,
545
- context_lens,
546
- **ckpt_kwargs
547
- )
548
- else:
549
- residual_out = self.audio_injector.injector[audio_attn_id](
550
- x=attn_hidden_states,
551
- context=attn_audio_emb,
552
- context_lens=context_lens)
553
- residual_out = rearrange(residual_out, "(b t) n c -> b (t n) c", t=num_frames)
554
- hidden_states[:, :self.original_seq_len] = hidden_states[:, :self.original_seq_len] + residual_out
555
-
556
- if self.sp_world_size > 1:
557
- hidden_states = torch.chunk(
558
- hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank]
559
-
560
- return hidden_states
561
-
562
- @cfg_skip()
563
- def forward(
564
- self,
565
- x,
566
- t,
567
- context,
568
- seq_len,
569
- ref_latents,
570
- motion_latents,
571
- cond_states,
572
- audio_input=None,
573
- motion_frames=[17, 5],
574
- add_last_motion=2,
575
- drop_motion_frames=False,
576
- cond_flag=True,
577
- *extra_args,
578
- **extra_kwargs
579
- ):
580
- """
581
- x: A list of videos each with shape [C, T, H, W].
582
- t: [B].
583
- context: A list of text embeddings each with shape [L, C].
584
- seq_len: A list of video token lengths; not needed by this model.
585
- ref_latents A list of reference images, one per video, each with shape [C, 1, H, W].
586
- motion_latents A list of motion frames for each video with shape [C, T_m, H, W].
587
- cond_states A list of condition frames (i.e. pose) each with shape [C, T, H, W].
588
- audio_input The input audio embedding [B, num_wav2vec_layer, C_a, T_a].
589
- motion_frames The number of motion frames and motion latent frames encoded by the VAE, e.g. [17, 5]
590
- add_last_motion For the motioner, if add_last_motion > 0, it means that the most recent frame (i.e., the last frame) will be added.
591
- For frame packing, the behavior depends on the value of add_last_motion:
592
- add_last_motion = 0: Only the farthest part of the latent (i.e., clean_latents_4x) is included.
593
- add_last_motion = 1: Both clean_latents_2x and clean_latents_4x are included.
594
- add_last_motion = 2: All motion-related latents are used.
595
- drop_motion_frames Bool, whether to drop the motion frame information
596
- """
597
- device = self.patch_embedding.weight.device
598
- dtype = x.dtype
599
- if self.freqs.device != device and torch.device(type="meta") != device:
600
- self.freqs = self.freqs.to(device)
601
- add_last_motion = self.add_last_motion * add_last_motion
602
-
603
- # Embeddings
604
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
605
-
606
- if isinstance(motion_frames[0], list):
607
- motion_frames_0 = motion_frames[0][0]
608
- motion_frames_1 = motion_frames[0][1]
609
- else:
610
- motion_frames_0 = motion_frames[0]
611
- motion_frames_1 = motion_frames[1]
612
- # Audio process
613
- audio_input = torch.cat([audio_input[..., 0:1].repeat(1, 1, 1, motion_frames_0), audio_input], dim=-1)
614
- if torch.is_grad_enabled() and self.gradient_checkpointing:
615
- def create_custom_forward(module):
616
- def custom_forward(*inputs):
617
- return module(*inputs)
618
-
619
- return custom_forward
620
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
621
- audio_emb_res = torch.utils.checkpoint.checkpoint(create_custom_forward(self.casual_audio_encoder), audio_input, **ckpt_kwargs)
622
- else:
623
- audio_emb_res = self.casual_audio_encoder(audio_input)
624
- if self.enbale_adain:
625
- audio_emb_global, audio_emb = audio_emb_res
626
- self.audio_emb_global = audio_emb_global[:, motion_frames_1:].clone()
627
- else:
628
- audio_emb = audio_emb_res
629
- self.merged_audio_emb = audio_emb[:, motion_frames_1:, :]
630
-
631
- # Cond states
632
- cond = [self.cond_encoder(c.unsqueeze(0)) for c in cond_states]
633
- x = [x_ + pose for x_, pose in zip(x, cond)]
634
-
635
- grid_sizes = torch.stack(
636
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
637
- x = [u.flatten(2).transpose(1, 2) for u in x]
638
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
639
-
640
- original_grid_sizes = deepcopy(grid_sizes)
641
- grid_sizes = [[torch.zeros_like(grid_sizes), grid_sizes, grid_sizes]]
642
-
643
- # Ref latents
644
- ref = [self.patch_embedding(r.unsqueeze(0)) for r in ref_latents]
645
- batch_size = len(ref)
646
- height, width = ref[0].shape[3], ref[0].shape[4]
647
- ref = [r.flatten(2).transpose(1, 2) for r in ref] # r: 1 c f h w
648
- x = [torch.cat([u, r], dim=1) for u, r in zip(x, ref)]
649
-
650
- self.original_seq_len = seq_lens[0]
651
- seq_lens = seq_lens + torch.tensor([r.size(1) for r in ref], dtype=torch.long)
652
- ref_grid_sizes = [
653
- [
654
- torch.tensor([30, 0, 0]).unsqueeze(0).repeat(batch_size, 1), # the start index
655
- torch.tensor([31, height,width]).unsqueeze(0).repeat(batch_size, 1), # the end index
656
- torch.tensor([1, height, width]).unsqueeze(0).repeat(batch_size, 1),
657
- ] # the range
658
- ]
659
- grid_sizes = grid_sizes + ref_grid_sizes
660
-
661
- # Compute the rope embeddings for the input
662
- x = torch.cat(x)
663
- b, s, n, d = x.size(0), x.size(1), self.num_heads, self.dim // self.num_heads
664
- self.pre_compute_freqs = rope_precompute(
665
- x.detach().view(b, s, n, d), grid_sizes, self.freqs, start=None)
666
- x = [u.unsqueeze(0) for u in x]
667
- self.pre_compute_freqs = [u.unsqueeze(0) for u in self.pre_compute_freqs]
668
-
669
- # Inject Motion latents.
670
- # Initialize masks to indicate noisy latent, ref latent, and motion latent.
671
- # However, at this point, only the first two (noisy and ref latents) are marked;
672
- # the marking of motion latent will be implemented inside `inject_motion`.
673
- mask_input = [
674
- torch.zeros([1, u.shape[1]], dtype=torch.long, device=x[0].device)
675
- for u in x
676
- ]
677
- for i in range(len(mask_input)):
678
- mask_input[i][:, self.original_seq_len:] = 1
679
-
680
- self.lat_motion_frames = motion_latents[0].shape[1]
681
- x, seq_lens, self.pre_compute_freqs, mask_input = self.inject_motion(
682
- x,
683
- seq_lens,
684
- self.pre_compute_freqs,
685
- mask_input,
686
- motion_latents,
687
- drop_motion_frames=drop_motion_frames,
688
- add_last_motion=add_last_motion)
689
- x = torch.cat(x, dim=0)
690
- self.pre_compute_freqs = torch.cat(self.pre_compute_freqs, dim=0)
691
- mask_input = torch.cat(mask_input, dim=0)
692
-
693
- # Apply trainable_cond_mask
694
- x = x + self.trainable_cond_mask(mask_input).to(x.dtype)
695
-
696
- seq_len = seq_lens.max()
697
- if self.sp_world_size > 1:
698
- seq_len = int(math.ceil(seq_len / self.sp_world_size)) * self.sp_world_size
699
- assert seq_lens.max() <= seq_len
700
- x = torch.cat([
701
- torch.cat([u.unsqueeze(0), u.new_zeros(1, seq_len - u.size(0), u.size(1))],
702
- dim=1) for u in x
703
- ])
704
-
705
- # Time embeddings
706
- if self.zero_timestep:
707
- t = torch.cat([t, torch.zeros([1], dtype=t.dtype, device=t.device)])
708
- with amp.autocast(dtype=torch.float32):
709
- e = self.time_embedding(
710
- sinusoidal_embedding_1d(self.freq_dim, t).float())
711
- e0 = self.time_projection(e).unflatten(1, (6, self.dim))
712
- assert e.dtype == torch.float32 and e0.dtype == torch.float32
713
-
714
- if self.zero_timestep:
715
- e = e[:-1]
716
- zero_e0 = e0[-1:]
717
- e0 = e0[:-1]
718
- token_len = x.shape[1]
719
-
720
- e0 = torch.cat(
721
- [
722
- e0.unsqueeze(2),
723
- zero_e0.unsqueeze(2).repeat(e0.size(0), 1, 1, 1)
724
- ],
725
- dim=2
726
- )
727
- e0 = [e0, self.original_seq_len]
728
- else:
729
- e0 = e0.unsqueeze(2).repeat(1, 1, 2, 1)
730
- e0 = [e0, 0]
731
-
732
- # context
733
- context_lens = None
734
- context = self.text_embedding(
735
- torch.stack([
736
- torch.cat(
737
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
738
- for u in context
739
- ]))
740
-
741
- if self.sp_world_size > 1:
742
- # Sharded tensors for long context attn
743
- x = torch.chunk(x, self.sp_world_size, dim=1)
744
- sq_size = [u.shape[1] for u in x]
745
- sq_start_size = sum(sq_size[:self.sp_world_rank])
746
- x = x[self.sp_world_rank]
747
- # Confirm the application range of the time embedding in e0[0] for each sequence:
748
- # - For tokens before seg_id: apply e0[0][:, :, 0]
749
- # - For tokens after seg_id: apply e0[0][:, :, 1]
750
- sp_size = x.shape[1]
751
- seg_idx = e0[1] - sq_start_size
752
- e0[1] = seg_idx
753
-
754
- self.pre_compute_freqs = torch.chunk(self.pre_compute_freqs, self.sp_world_size, dim=1)
755
- self.pre_compute_freqs = self.pre_compute_freqs[self.sp_world_rank]
756
-
757
- # TeaCache
758
- if self.teacache is not None:
759
- if cond_flag:
760
- if t.dim() != 1:
761
- modulated_inp = e0[0][:, -1, :]
762
- else:
763
- modulated_inp = e0[0]
764
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
765
- if skip_flag:
766
- self.should_calc = True
767
- self.teacache.accumulated_rel_l1_distance = 0
768
- else:
769
- if cond_flag:
770
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
771
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
772
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
773
- self.should_calc = False
774
- else:
775
- self.should_calc = True
776
- self.teacache.accumulated_rel_l1_distance = 0
777
- self.teacache.previous_modulated_input = modulated_inp
778
- self.teacache.should_calc = self.should_calc
779
- else:
780
- self.should_calc = self.teacache.should_calc
781
-
782
- # TeaCache
783
- if self.teacache is not None:
784
- if not self.should_calc:
785
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
786
- x = x + previous_residual.to(x.device)[-x.size()[0]:,]
787
- else:
788
- ori_x = x.clone().cpu() if self.teacache.offload else x.clone()
789
-
790
- for idx, block in enumerate(self.blocks):
791
- if torch.is_grad_enabled() and self.gradient_checkpointing:
792
-
793
- def create_custom_forward(module):
794
- def custom_forward(*inputs):
795
- return module(*inputs)
796
-
797
- return custom_forward
798
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
799
- x = torch.utils.checkpoint.checkpoint(
800
- create_custom_forward(block),
801
- x,
802
- e0,
803
- seq_lens,
804
- grid_sizes,
805
- self.pre_compute_freqs,
806
- context,
807
- context_lens,
808
- dtype,
809
- t,
810
- **ckpt_kwargs,
811
- )
812
- x = self.after_transformer_block(idx, x)
813
- else:
814
- # arguments
815
- kwargs = dict(
816
- e=e0,
817
- seq_lens=seq_lens,
818
- grid_sizes=grid_sizes,
819
- freqs=self.pre_compute_freqs,
820
- context=context,
821
- context_lens=context_lens,
822
- dtype=dtype,
823
- t=t
824
- )
825
- x = block(x, **kwargs)
826
- x = self.after_transformer_block(idx, x)
827
-
828
- if cond_flag:
829
- self.teacache.previous_residual_cond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
830
- else:
831
- self.teacache.previous_residual_uncond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
832
- else:
833
- for idx, block in enumerate(self.blocks):
834
- if torch.is_grad_enabled() and self.gradient_checkpointing:
835
-
836
- def create_custom_forward(module):
837
- def custom_forward(*inputs):
838
- return module(*inputs)
839
-
840
- return custom_forward
841
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
842
- x = torch.utils.checkpoint.checkpoint(
843
- create_custom_forward(block),
844
- x,
845
- e0,
846
- seq_lens,
847
- grid_sizes,
848
- self.pre_compute_freqs,
849
- context,
850
- context_lens,
851
- dtype,
852
- t,
853
- **ckpt_kwargs,
854
- )
855
- x = self.after_transformer_block(idx, x)
856
- else:
857
- # arguments
858
- kwargs = dict(
859
- e=e0,
860
- seq_lens=seq_lens,
861
- grid_sizes=grid_sizes,
862
- freqs=self.pre_compute_freqs,
863
- context=context,
864
- context_lens=context_lens,
865
- dtype=dtype,
866
- t=t
867
- )
868
- x = block(x, **kwargs)
869
- x = self.after_transformer_block(idx, x)
870
-
871
- # Context Parallel
872
- if self.sp_world_size > 1:
873
- x = self.all_gather(x.contiguous(), dim=1)
874
-
875
- # Unpatchify
876
- x = x[:, :self.original_seq_len]
877
- # head
878
- if torch.is_grad_enabled() and self.gradient_checkpointing:
879
- def create_custom_forward(module):
880
- def custom_forward(*inputs):
881
- return module(*inputs)
882
-
883
- return custom_forward
884
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
885
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.head), x, e, **ckpt_kwargs)
886
- else:
887
- x = self.head(x, e)
888
- x = self.unpatchify(x, original_grid_sizes)
889
- x = torch.stack(x)
890
- if self.teacache is not None and cond_flag:
891
- self.teacache.cnt += 1
892
- if self.teacache.cnt == self.teacache.num_steps:
893
- self.teacache.reset()
894
- return x
895
-
896
- def unpatchify(self, x, grid_sizes):
897
- """
898
- Reconstruct video tensors from patch embeddings.
899
-
900
- Args:
901
- x (List[Tensor]):
902
- List of patchified features, each with shape [L, C_out * prod(patch_size)]
903
- grid_sizes (Tensor):
904
- Original spatial-temporal grid dimensions before patching,
905
- shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
906
-
907
- Returns:
908
- List[Tensor]:
909
- Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
910
- """
911
-
912
- c = self.out_dim
913
- out = []
914
- for u, v in zip(x, grid_sizes.tolist()):
915
- u = u[:math.prod(v)].view(*v, *self.patch_size, c)
916
- u = torch.einsum('fhwpqrc->cfphqwr', u)
917
- u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
918
- out.append(u)
919
- return out
920
-
921
- def zero_init_weights(self):
922
- with torch.no_grad():
923
- self.trainable_cond_mask = zero_module(self.trainable_cond_mask)
924
- if hasattr(self, "cond_encoder"):
925
- self.cond_encoder = zero_module(self.cond_encoder)
926
-
927
- for i in range(self.audio_injector.injector.__len__()):
928
- self.audio_injector.injector[i].o = zero_module(
929
- self.audio_injector.injector[i].o)
930
- if self.enbale_adain:
931
- self.audio_injector.injector_adain_layers[i].linear = \
932
- zero_module(self.audio_injector.injector_adain_layers[i].linear)
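Both branches of the forward pass above share the TeaCache gate: a rescaled relative-L1 distance between successive modulated inputs is accumulated, and the full block stack is skipped (reusing the cached residual) while the accumulator stays below a threshold. A simplified stand-in for that decision (the `state` dict and the fixed threshold are assumptions; the real cache also applies a polynomial rescale_func and a warm-up step count):

import torch

def teacache_should_calc(prev_mod, cur_mod, state, thresh=0.08):
    # relative L1 distance between the current and previous modulated inputs
    rel_l1 = ((cur_mod - prev_mod).abs().mean() / prev_mod.abs().mean()).item()
    state["accumulated"] += rel_l1
    if state["accumulated"] < thresh:
        return False   # skip the transformer blocks, reuse the cached residual
    state["accumulated"] = 0.0
    return True        # run all blocks and refresh the cached residual

state = {"accumulated": 0.0}
recompute = teacache_should_calc(torch.ones(8), 1.1 * torch.ones(8), state)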
videox_fun/models/wan_transformer3d_vace.py DELETED
@@ -1,394 +0,0 @@
1
- # Modified from https://github.com/ali-vilab/VACE/blob/main/vace/models/wan/wan_vace.py
2
- # -*- coding: utf-8 -*-
3
- # Copyright (c) Alibaba, Inc. and its affiliates.
4
- from typing import Any, Dict
5
-
6
- import os
7
- import math
8
- import torch
9
- import torch.cuda.amp as amp
10
- import torch.nn as nn
11
- from diffusers.configuration_utils import register_to_config
12
- from diffusers.utils import is_torch_version
13
-
14
- from .wan_transformer3d import (WanAttentionBlock, WanTransformer3DModel,
15
- sinusoidal_embedding_1d)
16
- from ..utils import cfg_skip
17
-
18
-
19
- VIDEOX_OFFLOAD_VACE_LATENTS = os.environ.get("VIDEOX_OFFLOAD_VACE_LATENTS", False)
20
-
21
- class VaceWanAttentionBlock(WanAttentionBlock):
22
- def __init__(
23
- self,
24
- cross_attn_type,
25
- dim,
26
- ffn_dim,
27
- num_heads,
28
- window_size=(-1, -1),
29
- qk_norm=True,
30
- cross_attn_norm=False,
31
- eps=1e-6,
32
- block_id=0
33
- ):
34
- super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps)
35
- self.block_id = block_id
36
- if block_id == 0:
37
- self.before_proj = nn.Linear(self.dim, self.dim)
38
- nn.init.zeros_(self.before_proj.weight)
39
- nn.init.zeros_(self.before_proj.bias)
40
- self.after_proj = nn.Linear(self.dim, self.dim)
41
- nn.init.zeros_(self.after_proj.weight)
42
- nn.init.zeros_(self.after_proj.bias)
43
-
44
- def forward(self, c, x, **kwargs):
45
- if self.block_id == 0:
46
- c = self.before_proj(c) + x
47
- all_c = []
48
- else:
49
- all_c = list(torch.unbind(c))
50
- c = all_c.pop(-1)
51
-
52
- if VIDEOX_OFFLOAD_VACE_LATENTS:
53
- c = c.to(x.device)
54
-
55
- c = super().forward(c, **kwargs)
56
- c_skip = self.after_proj(c)
57
-
58
- if VIDEOX_OFFLOAD_VACE_LATENTS:
59
- c_skip = c_skip.to("cpu")
60
- c = c.to("cpu")
61
-
62
- all_c += [c_skip, c]
63
- c = torch.stack(all_c)
64
- return c
65
-
66
-
67
- class BaseWanAttentionBlock(WanAttentionBlock):
68
- def __init__(
69
- self,
70
- cross_attn_type,
71
- dim,
72
- ffn_dim,
73
- num_heads,
74
- window_size=(-1, -1),
75
- qk_norm=True,
76
- cross_attn_norm=False,
77
- eps=1e-6,
78
- block_id=None
79
- ):
80
- super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps)
81
- self.block_id = block_id
82
-
83
- def forward(self, x, hints, context_scale=1.0, **kwargs):
84
- x = super().forward(x, **kwargs)
85
- if self.block_id is not None:
86
- if VIDEOX_OFFLOAD_VACE_LATENTS:
87
- x = x + hints[self.block_id].to(x.device) * context_scale
88
- else:
89
- x = x + hints[self.block_id] * context_scale
90
- return x
91
-
92
-
93
- class VaceWanTransformer3DModel(WanTransformer3DModel):
94
- @register_to_config
95
- def __init__(self,
96
- vace_layers=None,
97
- vace_in_dim=None,
98
- model_type='t2v',
99
- patch_size=(1, 2, 2),
100
- text_len=512,
101
- in_dim=16,
102
- dim=2048,
103
- ffn_dim=8192,
104
- freq_dim=256,
105
- text_dim=4096,
106
- out_dim=16,
107
- num_heads=16,
108
- num_layers=32,
109
- window_size=(-1, -1),
110
- qk_norm=True,
111
- cross_attn_norm=True,
112
- eps=1e-6):
113
- model_type = "t2v" # TODO: Hard-coded for both the preview and official versions.
114
- super().__init__(model_type, patch_size, text_len, in_dim, dim, ffn_dim, freq_dim, text_dim, out_dim,
115
- num_heads, num_layers, window_size, qk_norm, cross_attn_norm, eps)
116
-
117
- self.vace_layers = [i for i in range(0, self.num_layers, 2)] if vace_layers is None else vace_layers
118
- self.vace_in_dim = self.in_dim if vace_in_dim is None else vace_in_dim
119
-
120
- assert 0 in self.vace_layers
121
- self.vace_layers_mapping = {i: n for n, i in enumerate(self.vace_layers)}
122
-
123
- # blocks
124
- self.blocks = nn.ModuleList([
125
- BaseWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm,
126
- self.cross_attn_norm, self.eps,
127
- block_id=self.vace_layers_mapping[i] if i in self.vace_layers else None)
128
- for i in range(self.num_layers)
129
- ])
130
-
131
- # vace blocks
132
- self.vace_blocks = nn.ModuleList([
133
- VaceWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm,
134
- self.cross_attn_norm, self.eps, block_id=i)
135
- for i in self.vace_layers
136
- ])
137
-
138
- # vace patch embeddings
139
- self.vace_patch_embedding = nn.Conv3d(
140
- self.vace_in_dim, self.dim, kernel_size=self.patch_size, stride=self.patch_size
141
- )
142
-
143
- def forward_vace(
144
- self,
145
- x,
146
- vace_context,
147
- seq_len,
148
- kwargs
149
- ):
150
- # embeddings
151
- c = [self.vace_patch_embedding(u.unsqueeze(0)) for u in vace_context]
152
- c = [u.flatten(2).transpose(1, 2) for u in c]
153
- c = torch.cat([
154
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
155
- dim=1) for u in c
156
- ])
157
- # Context Parallel
158
- if self.sp_world_size > 1:
159
- c = torch.chunk(c, self.sp_world_size, dim=1)[self.sp_world_rank]
160
-
161
- # arguments
162
- new_kwargs = dict(x=x)
163
- new_kwargs.update(kwargs)
164
-
165
- for block in self.vace_blocks:
166
- if torch.is_grad_enabled() and self.gradient_checkpointing:
167
- def create_custom_forward(module, **static_kwargs):
168
- def custom_forward(*inputs):
169
- return module(*inputs, **static_kwargs)
170
- return custom_forward
171
- ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
172
- c = torch.utils.checkpoint.checkpoint(
173
- create_custom_forward(block, **new_kwargs),
174
- c,
175
- **ckpt_kwargs,
176
- )
177
- else:
178
- c = block(c, **new_kwargs)
179
- hints = torch.unbind(c)[:-1]
180
- return hints
181
-
182
- @cfg_skip()
183
- def forward(
184
- self,
185
- x,
186
- t,
187
- vace_context,
188
- context,
189
- seq_len,
190
- vace_context_scale=1.0,
191
- clip_fea=None,
192
- y=None,
193
- cond_flag=True
194
- ):
195
- r"""
196
- Forward pass through the diffusion model
197
-
198
- Args:
199
- x (List[Tensor]):
200
- List of input video tensors, each with shape [C_in, F, H, W]
201
- t (Tensor):
202
- Diffusion timesteps tensor of shape [B]
203
- context (List[Tensor]):
204
- List of text embeddings each with shape [L, C]
205
- seq_len (`int`):
206
- Maximum sequence length for positional encoding
207
- clip_fea (Tensor, *optional*):
208
- CLIP image features for image-to-video mode
209
- y (List[Tensor], *optional*):
210
- Conditional video inputs for image-to-video mode, same shape as x
211
-
212
- Returns:
213
- List[Tensor]:
214
- List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
215
- """
216
- # if self.model_type == 'i2v':
217
- # assert clip_fea is not None and y is not None
218
- # params
219
- device = self.patch_embedding.weight.device
220
- dtype = x.dtype
221
- if self.freqs.device != device and torch.device(type="meta") != device:
222
- self.freqs = self.freqs.to(device)
223
-
224
- # if y is not None:
225
- # x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
226
-
227
- # embeddings
228
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
229
- grid_sizes = torch.stack(
230
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
231
- x = [u.flatten(2).transpose(1, 2) for u in x]
232
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
233
- if self.sp_world_size > 1:
234
- seq_len = int(math.ceil(seq_len / self.sp_world_size)) * self.sp_world_size
235
- assert seq_lens.max() <= seq_len
236
- x = torch.cat([
237
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
238
- dim=1) for u in x
239
- ])
240
-
241
- # time embeddings
242
- with amp.autocast(dtype=torch.float32):
243
- e = self.time_embedding(
244
- sinusoidal_embedding_1d(self.freq_dim, t).float())
245
- e0 = self.time_projection(e).unflatten(1, (6, self.dim))
246
- assert e.dtype == torch.float32 and e0.dtype == torch.float32
247
-
248
- # context
249
- context_lens = None
250
- context = self.text_embedding(
251
- torch.stack([
252
- torch.cat(
253
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
254
- for u in context
255
- ]))
256
-
257
- # Context Parallel
258
- if self.sp_world_size > 1:
259
- x = torch.chunk(x, self.sp_world_size, dim=1)[self.sp_world_rank]
260
-
261
- # arguments
262
- kwargs = dict(
263
- e=e0,
264
- seq_lens=seq_lens,
265
- grid_sizes=grid_sizes,
266
- freqs=self.freqs,
267
- context=context,
268
- context_lens=context_lens,
269
- dtype=dtype,
270
- t=t)
271
- hints = self.forward_vace(x, vace_context, seq_len, kwargs)
272
-
273
- kwargs['hints'] = hints
274
- kwargs['context_scale'] = vace_context_scale
275
-
276
- # TeaCache
277
- if self.teacache is not None:
278
- if cond_flag:
279
- if t.dim() != 1:
280
- modulated_inp = e0[:, -1, :]
281
- else:
282
- modulated_inp = e0
283
- skip_flag = self.teacache.cnt < self.teacache.num_skip_start_steps
284
- if skip_flag:
285
- self.should_calc = True
286
- self.teacache.accumulated_rel_l1_distance = 0
287
- else:
288
- if cond_flag:
289
- rel_l1_distance = self.teacache.compute_rel_l1_distance(self.teacache.previous_modulated_input, modulated_inp)
290
- self.teacache.accumulated_rel_l1_distance += self.teacache.rescale_func(rel_l1_distance)
291
- if self.teacache.accumulated_rel_l1_distance < self.teacache.rel_l1_thresh:
292
- self.should_calc = False
293
- else:
294
- self.should_calc = True
295
- self.teacache.accumulated_rel_l1_distance = 0
296
- self.teacache.previous_modulated_input = modulated_inp
297
- self.teacache.should_calc = self.should_calc
298
- else:
299
- self.should_calc = self.teacache.should_calc
300
-
301
- # TeaCache
302
- if self.teacache is not None:
303
- if not self.should_calc:
304
- previous_residual = self.teacache.previous_residual_cond if cond_flag else self.teacache.previous_residual_uncond
305
- x = x + previous_residual.to(x.device)[-x.size()[0]:,]
306
- else:
307
- ori_x = x.clone().cpu() if self.teacache.offload else x.clone()
308
-
309
- for block in self.blocks:
310
- if torch.is_grad_enabled() and self.gradient_checkpointing:
311
- def create_custom_forward(module, **static_kwargs):
312
- def custom_forward(*inputs):
313
- return module(*inputs, **static_kwargs)
314
- return custom_forward
315
- extra_kwargs = {
316
- 'e': e0,
317
- 'seq_lens': seq_lens,
318
- 'grid_sizes': grid_sizes,
319
- 'freqs': self.freqs,
320
- 'context': context,
321
- 'context_lens': context_lens,
322
- 'dtype': dtype,
323
- 't': t,
324
- }
325
-
326
- ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
327
-
328
- x = torch.utils.checkpoint.checkpoint(
329
- create_custom_forward(block, **extra_kwargs),
330
- x,
331
- hints,
332
- vace_context_scale,
333
- **ckpt_kwargs,
334
- )
335
- else:
336
- x = block(x, **kwargs)
337
-
338
- if cond_flag:
339
- self.teacache.previous_residual_cond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
340
- else:
341
- self.teacache.previous_residual_uncond = x.cpu() - ori_x if self.teacache.offload else x - ori_x
342
- else:
343
- for block in self.blocks:
344
- if torch.is_grad_enabled() and self.gradient_checkpointing:
345
- def create_custom_forward(module, **static_kwargs):
346
- def custom_forward(*inputs):
347
- return module(*inputs, **static_kwargs)
348
- return custom_forward
349
- extra_kwargs = {
350
- 'e': e0,
351
- 'seq_lens': seq_lens,
352
- 'grid_sizes': grid_sizes,
353
- 'freqs': self.freqs,
354
- 'context': context,
355
- 'context_lens': context_lens,
356
- 'dtype': dtype,
357
- 't': t,
358
- }
359
-
360
- ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
361
-
362
- x = torch.utils.checkpoint.checkpoint(
363
- create_custom_forward(block, **extra_kwargs),
364
- x,
365
- hints,
366
- vace_context_scale,
367
- **ckpt_kwargs,
368
- )
369
- else:
370
- x = block(x, **kwargs)
371
-
372
- # head
373
- if torch.is_grad_enabled() and self.gradient_checkpointing:
374
- def create_custom_forward(module):
375
- def custom_forward(*inputs):
376
- return module(*inputs)
377
-
378
- return custom_forward
379
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
380
- x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.head), x, e, **ckpt_kwargs)
381
- else:
382
- x = self.head(x, e)
383
-
384
- if self.sp_world_size > 1:
385
- x = self.all_gather(x, dim=1)
386
-
387
- # unpatchify
388
- x = self.unpatchify(x, grid_sizes)
389
- x = torch.stack(x)
390
- if self.teacache is not None and cond_flag:
391
- self.teacache.cnt += 1
392
- if self.teacache.cnt == self.teacache.num_steps:
393
- self.teacache.reset()
394
- return x
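The file above follows a ControlNet-style layout: VaceWanAttentionBlock layers run on the VACE context and emit per-layer hints through zero-initialized after_proj projections, and BaseWanAttentionBlock adds hints[self.block_id] * context_scale back into the main token stream. A stripped-down sketch of that injection pattern with toy blocks (the class and tensor names here are illustrative, not the deleted module's API):

import torch
import torch.nn as nn

class ToyHintBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.body = nn.Linear(dim, dim)
        self.after_proj = nn.Linear(dim, dim)   # zero-initialized skip projection
        nn.init.zeros_(self.after_proj.weight)
        nn.init.zeros_(self.after_proj.bias)

    def forward(self, c):
        c = self.body(c)
        return self.after_proj(c), c            # (hint for the main stream, updated context)

class ToyBaseBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.body = nn.Linear(dim, dim)

    def forward(self, x, hint, context_scale=1.0):
        return self.body(x) + hint * context_scale

dim = 8
x = torch.randn(1, 4, dim)          # main stream tokens
c = torch.randn(1, 4, dim)          # VACE context tokens
hint, c = ToyHintBlock(dim)(c)
out = ToyBaseBlock(dim)(x, hint, context_scale=0.5)

Because after_proj starts at zero, the hints are initially zero and the base transformer behaves exactly as before the VACE branch was attached; context_scale then lets the caller dial the conditioning strength at inference time.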
videox_fun/models/wan_vae.py DELETED
@@ -1,860 +0,0 @@
1
- # Modified from https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/vae.py
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- from typing import Tuple, Union
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from diffusers.configuration_utils import ConfigMixin, register_to_config
9
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
10
- from diffusers.models.autoencoders.vae import (DecoderOutput,
11
- DiagonalGaussianDistribution)
12
- from diffusers.models.modeling_outputs import AutoencoderKLOutput
13
- from diffusers.models.modeling_utils import ModelMixin
14
- from diffusers.utils.accelerate_utils import apply_forward_hook
15
- from einops import rearrange
16
-
17
-
18
- CACHE_T = 2
19
-
20
-
21
- class CausalConv3d(nn.Conv3d):
22
- """
23
- Causal 3D convolution.
24
- """
25
-
26
- def __init__(self, *args, **kwargs):
27
- super().__init__(*args, **kwargs)
28
- self._padding = (self.padding[2], self.padding[2], self.padding[1],
29
- self.padding[1], 2 * self.padding[0], 0)
30
- self.padding = (0, 0, 0)
31
-
32
- def forward(self, x, cache_x=None):
33
- padding = list(self._padding)
34
- if cache_x is not None and self._padding[4] > 0:
35
- cache_x = cache_x.to(x.device)
36
- x = torch.cat([cache_x, x], dim=2)
37
- padding[4] -= cache_x.shape[2]
38
- x = F.pad(x, padding)
39
-
40
- return super().forward(x)
41
-
42
-
43
- class RMS_norm(nn.Module):
44
-
45
- def __init__(self, dim, channel_first=True, images=True, bias=False):
46
- super().__init__()
47
- broadcastable_dims = (1, 1, 1) if not images else (1, 1)
48
- shape = (dim, *broadcastable_dims) if channel_first else (dim,)
49
-
50
- self.channel_first = channel_first
51
- self.scale = dim**0.5
52
- self.gamma = nn.Parameter(torch.ones(shape))
53
- self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
54
-
55
- def forward(self, x):
56
- return F.normalize(
57
- x, dim=(1 if self.channel_first else
58
- -1)) * self.scale * self.gamma + self.bias
59
-
60
-
61
- class Upsample(nn.Upsample):
62
-
63
- def forward(self, x):
64
- """
65
- Fix bfloat16 support for nearest neighbor interpolation.
66
- """
67
- return super().forward(x.float()).type_as(x)
68
-
69
-
70
- class Resample(nn.Module):
71
-
72
- def __init__(self, dim, mode):
73
- assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
74
- 'downsample3d')
75
- super().__init__()
76
- self.dim = dim
77
- self.mode = mode
78
-
79
- # layers
80
- if mode == 'upsample2d':
81
- self.resample = nn.Sequential(
82
- Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
83
- nn.Conv2d(dim, dim // 2, 3, padding=1))
84
- elif mode == 'upsample3d':
85
- self.resample = nn.Sequential(
86
- Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
87
- nn.Conv2d(dim, dim // 2, 3, padding=1))
88
- self.time_conv = CausalConv3d(
89
- dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
90
-
91
- elif mode == 'downsample2d':
92
- self.resample = nn.Sequential(
93
- nn.ZeroPad2d((0, 1, 0, 1)),
94
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
95
- elif mode == 'downsample3d':
96
- self.resample = nn.Sequential(
97
- nn.ZeroPad2d((0, 1, 0, 1)),
98
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
99
- self.time_conv = CausalConv3d(
100
- dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
101
-
102
- else:
103
- self.resample = nn.Identity()
104
-
105
- def forward(self, x, feat_cache=None, feat_idx=[0]):
106
- b, c, t, h, w = x.size()
107
- if self.mode == 'upsample3d':
108
- if feat_cache is not None:
109
- idx = feat_idx[0]
110
- if feat_cache[idx] is None:
111
- feat_cache[idx] = 'Rep'
112
- feat_idx[0] += 1
113
- else:
114
-
115
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
116
- if cache_x.shape[2] < 2 and feat_cache[
117
- idx] is not None and feat_cache[idx] != 'Rep':
118
- # cache last frame of last two chunk
119
- cache_x = torch.cat([
120
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
121
- cache_x.device), cache_x
122
- ],
123
- dim=2)
124
- if cache_x.shape[2] < 2 and feat_cache[
125
- idx] is not None and feat_cache[idx] == 'Rep':
126
- cache_x = torch.cat([
127
- torch.zeros_like(cache_x).to(cache_x.device),
128
- cache_x
129
- ],
130
- dim=2)
131
- if feat_cache[idx] == 'Rep':
132
- x = self.time_conv(x)
133
- else:
134
- x = self.time_conv(x, feat_cache[idx])
135
- feat_cache[idx] = cache_x
136
- feat_idx[0] += 1
137
-
138
- x = x.reshape(b, 2, c, t, h, w)
139
- x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
140
- 3)
141
- x = x.reshape(b, c, t * 2, h, w)
142
- t = x.shape[2]
143
- x = rearrange(x, 'b c t h w -> (b t) c h w')
144
- x = self.resample(x)
145
- x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
146
-
147
- if self.mode == 'downsample3d':
148
- if feat_cache is not None:
149
- idx = feat_idx[0]
150
- if feat_cache[idx] is None:
151
- feat_cache[idx] = x.clone()
152
- feat_idx[0] += 1
153
- else:
154
-
155
- cache_x = x[:, :, -1:, :, :].clone()
156
- # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
157
- # # cache last frame of last two chunk
158
- # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
159
-
160
- x = self.time_conv(
161
- torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
162
- feat_cache[idx] = cache_x
163
- feat_idx[0] += 1
164
- return x
165
-
166
- def init_weight(self, conv):
167
- conv_weight = conv.weight
168
- nn.init.zeros_(conv_weight)
169
- c1, c2, t, h, w = conv_weight.size()
170
- one_matrix = torch.eye(c1, c2)
171
- init_matrix = one_matrix
172
- nn.init.zeros_(conv_weight)
173
- #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
174
- conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5
175
- conv.weight.data.copy_(conv_weight)
176
- nn.init.zeros_(conv.bias.data)
177
-
178
- def init_weight2(self, conv):
179
- conv_weight = conv.weight.data
180
- nn.init.zeros_(conv_weight)
181
- c1, c2, t, h, w = conv_weight.size()
182
- init_matrix = torch.eye(c1 // 2, c2)
183
- #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
184
- conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
185
- conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
186
- conv.weight.data.copy_(conv_weight)
187
- nn.init.zeros_(conv.bias.data)
188
-
189
-
190
- class ResidualBlock(nn.Module):
191
-
192
- def __init__(self, in_dim, out_dim, dropout=0.0):
193
- super().__init__()
194
- self.in_dim = in_dim
195
- self.out_dim = out_dim
196
-
197
- # layers
198
- self.residual = nn.Sequential(
199
- RMS_norm(in_dim, images=False), nn.SiLU(),
200
- CausalConv3d(in_dim, out_dim, 3, padding=1),
201
- RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
202
- CausalConv3d(out_dim, out_dim, 3, padding=1))
203
- self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
204
- if in_dim != out_dim else nn.Identity()
205
-
206
- def forward(self, x, feat_cache=None, feat_idx=[0]):
207
- h = self.shortcut(x)
208
- for layer in self.residual:
209
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
210
- idx = feat_idx[0]
211
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
212
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
213
- # cache last frame of last two chunk
214
- cache_x = torch.cat([
215
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
216
- cache_x.device), cache_x
217
- ],
218
- dim=2)
219
- x = layer(x, feat_cache[idx])
220
- feat_cache[idx] = cache_x
221
- feat_idx[0] += 1
222
- else:
223
- x = layer(x)
224
- return x + h
225
-
226
-
227
- class AttentionBlock(nn.Module):
228
- """
229
- Causal self-attention with a single head.
230
- """
231
-
232
- def __init__(self, dim):
233
- super().__init__()
234
- self.dim = dim
235
-
236
- # layers
237
- self.norm = RMS_norm(dim)
238
- self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
239
- self.proj = nn.Conv2d(dim, dim, 1)
240
-
241
- # zero out the last layer params
242
- nn.init.zeros_(self.proj.weight)
243
-
244
- def forward(self, x):
245
- identity = x
246
- b, c, t, h, w = x.size()
247
- x = rearrange(x, 'b c t h w -> (b t) c h w')
248
- x = self.norm(x)
249
- # compute query, key, value
250
- q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3,
251
- -1).permute(0, 1, 3,
252
- 2).contiguous().chunk(
253
- 3, dim=-1)
254
-
255
- # apply attention
256
- x = F.scaled_dot_product_attention(
257
- q,
258
- k,
259
- v,
260
- )
261
- x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
262
-
263
- # output
264
- x = self.proj(x)
265
- x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
266
- return x + identity
267
-
268
-
269
- class Encoder3d(nn.Module):
270
-
271
- def __init__(self,
272
- dim=128,
273
- z_dim=4,
274
- dim_mult=[1, 2, 4, 4],
275
- num_res_blocks=2,
276
- attn_scales=[],
277
- temperal_downsample=[True, True, False],
278
- dropout=0.0):
279
- super().__init__()
280
- self.dim = dim
281
- self.z_dim = z_dim
282
- self.dim_mult = dim_mult
283
- self.num_res_blocks = num_res_blocks
284
- self.attn_scales = attn_scales
285
- self.temperal_downsample = temperal_downsample
286
-
287
- # dimensions
288
- dims = [dim * u for u in [1] + dim_mult]
289
- scale = 1.0
290
-
291
- # init block
292
- self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
293
-
294
- # downsample blocks
295
- downsamples = []
296
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
297
- # residual (+attention) blocks
298
- for _ in range(num_res_blocks):
299
- downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
300
- if scale in attn_scales:
301
- downsamples.append(AttentionBlock(out_dim))
302
- in_dim = out_dim
303
-
304
- # downsample block
305
- if i != len(dim_mult) - 1:
306
- mode = 'downsample3d' if temperal_downsample[
307
- i] else 'downsample2d'
308
- downsamples.append(Resample(out_dim, mode=mode))
309
- scale /= 2.0
310
- self.downsamples = nn.Sequential(*downsamples)
311
-
312
- # middle blocks
313
- self.middle = nn.Sequential(
314
- ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
315
- ResidualBlock(out_dim, out_dim, dropout))
316
-
317
- # output blocks
318
- self.head = nn.Sequential(
319
- RMS_norm(out_dim, images=False), nn.SiLU(),
320
- CausalConv3d(out_dim, z_dim, 3, padding=1))
321
-
322
- def forward(self, x, feat_cache=None, feat_idx=[0]):
323
- if feat_cache is not None:
324
- idx = feat_idx[0]
325
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
326
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
327
- # cache last frame of last two chunk
328
- cache_x = torch.cat([
329
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
330
- cache_x.device), cache_x
331
- ],
332
- dim=2)
333
- x = self.conv1(x, feat_cache[idx])
334
- feat_cache[idx] = cache_x
335
- feat_idx[0] += 1
336
- else:
337
- x = self.conv1(x)
338
-
339
- ## downsamples
340
- for layer in self.downsamples:
341
- if feat_cache is not None:
342
- x = layer(x, feat_cache, feat_idx)
343
- else:
344
- x = layer(x)
345
-
346
- ## middle
347
- for layer in self.middle:
348
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
349
- x = layer(x, feat_cache, feat_idx)
350
- else:
351
- x = layer(x)
352
-
353
- ## head
354
- for layer in self.head:
355
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
356
- idx = feat_idx[0]
357
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
358
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
359
- # cache last frame of last two chunk
360
- cache_x = torch.cat([
361
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
362
- cache_x.device), cache_x
363
- ],
364
- dim=2)
365
- x = layer(x, feat_cache[idx])
366
- feat_cache[idx] = cache_x
367
- feat_idx[0] += 1
368
- else:
369
- x = layer(x)
370
- return x
371
-
372
-
373
- class Decoder3d(nn.Module):
374
-
375
- def __init__(self,
376
- dim=128,
377
- z_dim=4,
378
- dim_mult=[1, 2, 4, 4],
379
- num_res_blocks=2,
380
- attn_scales=[],
381
- temperal_upsample=[False, True, True],
382
- dropout=0.0):
383
- super().__init__()
384
- self.dim = dim
385
- self.z_dim = z_dim
386
- self.dim_mult = dim_mult
387
- self.num_res_blocks = num_res_blocks
388
- self.attn_scales = attn_scales
389
- self.temperal_upsample = temperal_upsample
390
-
391
- # dimensions
392
- dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
393
- scale = 1.0 / 2**(len(dim_mult) - 2)
394
-
395
- # init block
396
- self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
397
-
398
- # middle blocks
399
- self.middle = nn.Sequential(
400
- ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
401
- ResidualBlock(dims[0], dims[0], dropout))
402
-
403
- # upsample blocks
404
- upsamples = []
405
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
406
- # residual (+attention) blocks
407
- if i == 1 or i == 2 or i == 3:
408
- in_dim = in_dim // 2
409
- for _ in range(num_res_blocks + 1):
410
- upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
411
- if scale in attn_scales:
412
- upsamples.append(AttentionBlock(out_dim))
413
- in_dim = out_dim
414
-
415
- # upsample block
416
- if i != len(dim_mult) - 1:
417
- mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
418
- upsamples.append(Resample(out_dim, mode=mode))
419
- scale *= 2.0
420
- self.upsamples = nn.Sequential(*upsamples)
421
-
422
- # output blocks
423
- self.head = nn.Sequential(
424
- RMS_norm(out_dim, images=False), nn.SiLU(),
425
- CausalConv3d(out_dim, 3, 3, padding=1))
426
-
427
- def forward(self, x, feat_cache=None, feat_idx=[0]):
428
- ## conv1
429
- if feat_cache is not None:
430
- idx = feat_idx[0]
431
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
432
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
433
- # cache last frame of last two chunk
434
- cache_x = torch.cat([
435
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
436
- cache_x.device), cache_x
437
- ],
438
- dim=2)
439
- x = self.conv1(x, feat_cache[idx])
440
- feat_cache[idx] = cache_x
441
- feat_idx[0] += 1
442
- else:
443
- x = self.conv1(x)
444
-
445
- ## middle
446
- for layer in self.middle:
447
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
448
- x = layer(x, feat_cache, feat_idx)
449
- else:
450
- x = layer(x)
451
-
452
- ## upsamples
453
- for layer in self.upsamples:
454
- if feat_cache is not None:
455
- x = layer(x, feat_cache, feat_idx)
456
- else:
457
- x = layer(x)
458
-
459
- ## head
460
- for layer in self.head:
461
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
462
- idx = feat_idx[0]
463
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
464
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
465
- # cache last frame of last two chunk
466
- cache_x = torch.cat([
467
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
468
- cache_x.device), cache_x
469
- ],
470
- dim=2)
471
- x = layer(x, feat_cache[idx])
472
- feat_cache[idx] = cache_x
473
- feat_idx[0] += 1
474
- else:
475
- x = layer(x)
476
- return x
477
-
478
-
479
- def count_conv3d(model):
480
- count = 0
481
- for m in model.modules():
482
- if isinstance(m, CausalConv3d):
483
- count += 1
484
- return count
485
-
486
-
487
- class AutoencoderKLWan_(nn.Module):
488
-
489
- def __init__(self,
490
- dim=128,
491
- z_dim=4,
492
- dim_mult=[1, 2, 4, 4],
493
- num_res_blocks=2,
494
- attn_scales=[],
495
- temperal_downsample=[True, True, False],
496
- dropout=0.0):
497
- super().__init__()
498
- self.dim = dim
499
- self.z_dim = z_dim
500
- self.dim_mult = dim_mult
501
- self.num_res_blocks = num_res_blocks
502
- self.attn_scales = attn_scales
503
- self.temperal_downsample = temperal_downsample
504
- self.temperal_upsample = temperal_downsample[::-1]
505
-
506
- # modules
507
- self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
508
- attn_scales, self.temperal_downsample, dropout)
509
- self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
510
- self.conv2 = CausalConv3d(z_dim, z_dim, 1)
511
- self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
512
- attn_scales, self.temperal_upsample, dropout)
513
-
514
- def forward(self, x):
515
- mu, log_var = self.encode(x)
516
- z = self.reparameterize(mu, log_var)
517
- x_recon = self.decode(z)
518
- return x_recon, mu, log_var
519
-
520
- def encode(self, x, scale=None):
521
- self.clear_cache()
522
- ## cache
523
- t = x.shape[2]
524
- iter_ = 1 + (t - 1) // 4
525
- if scale != None:
526
- scale = [item.to(x.device, x.dtype) for item in scale]
527
- ## Split the input x to encode along the time axis into chunks of 1, 4, 4, 4, ...
528
- for i in range(iter_):
529
- self._enc_conv_idx = [0]
530
- if i == 0:
531
- out = self.encoder(
532
- x[:, :, :1, :, :],
533
- feat_cache=self._enc_feat_map,
534
- feat_idx=self._enc_conv_idx)
535
- else:
536
- out_ = self.encoder(
537
- x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
538
- feat_cache=self._enc_feat_map,
539
- feat_idx=self._enc_conv_idx)
540
- out = torch.cat([out, out_], 2)
541
- mu, log_var = self.conv1(out).chunk(2, dim=1)
542
- if scale != None:
543
- if isinstance(scale[0], torch.Tensor):
544
- mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
545
- 1, self.z_dim, 1, 1, 1)
546
- else:
547
- mu = (mu - scale[0]) * scale[1]
548
- x = torch.cat([mu, log_var], dim = 1)
549
- self.clear_cache()
550
- return x
551
-
552
- def decode(self, z, scale=None):
553
- self.clear_cache()
554
- # z: [b,c,t,h,w]
555
- if scale != None:
556
- scale = [item.to(z.device, z.dtype) for item in scale]
557
- if isinstance(scale[0], torch.Tensor):
558
- z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
559
- 1, self.z_dim, 1, 1, 1)
560
- else:
561
- z = z / scale[1] + scale[0]
562
- iter_ = z.shape[2]
563
- x = self.conv2(z)
564
- for i in range(iter_):
565
- self._conv_idx = [0]
566
- if i == 0:
567
- out = self.decoder(
568
- x[:, :, i:i + 1, :, :],
569
- feat_cache=self._feat_map,
570
- feat_idx=self._conv_idx)
571
- else:
572
- out_ = self.decoder(
573
- x[:, :, i:i + 1, :, :],
574
- feat_cache=self._feat_map,
575
- feat_idx=self._conv_idx)
576
- out = torch.cat([out, out_], 2)
577
- self.clear_cache()
578
- return out
579
-
580
- def reparameterize(self, mu, log_var):
581
- std = torch.exp(0.5 * log_var)
582
- eps = torch.randn_like(std)
583
- return eps * std + mu
584
-
585
- def sample(self, imgs, deterministic=False):
586
- mu, log_var = self.encode(imgs)
587
- if deterministic:
588
- return mu
589
- std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
590
- return mu + std * torch.randn_like(std)
591
-
592
- def clear_cache(self):
593
- self._conv_num = count_conv3d(self.decoder)
594
- self._conv_idx = [0]
595
- self._feat_map = [None] * self._conv_num
596
- #cache encode
597
- self._enc_conv_num = count_conv3d(self.encoder)
598
- self._enc_conv_idx = [0]
599
- self._enc_feat_map = [None] * self._enc_conv_num
600
-
601
-
602
- def _video_vae(z_dim=None, **kwargs):
603
- """
604
- Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
605
- """
606
- # params
607
- cfg = dict(
608
- dim=96,
609
- z_dim=z_dim,
610
- dim_mult=[1, 2, 4, 4],
611
- num_res_blocks=2,
612
- attn_scales=[],
613
- temperal_downsample=[False, True, True],
614
- dropout=0.0)
615
- cfg.update(**kwargs)
616
-
617
- # init model
618
- model = AutoencoderKLWan_(**cfg)
619
-
620
- return model
621
-
622
-
623
- class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
624
- _supports_gradient_checkpointing = True
625
-
626
- @register_to_config
627
- def __init__(
628
- self,
629
- latent_channels=16,
630
- temporal_compression_ratio=4,
631
- spatial_compression_ratio=8
632
- ):
633
- super().__init__()
634
- mean = [
635
- -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
636
- 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
637
- ]
638
- std = [
639
- 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
640
- 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
641
- ]
642
- self.mean = torch.tensor(mean, dtype=torch.float32)
643
- self.std = torch.tensor(std, dtype=torch.float32)
644
- self.scale = [self.mean, 1.0 / self.std]
645
-
646
- # init model
647
- self.model = _video_vae(
648
- z_dim=latent_channels,
649
- )
650
-
651
- self.gradient_checkpointing = False
652
-
653
- def _set_gradient_checkpointing(self, *args, **kwargs):
654
- if "value" in kwargs:
655
- self.gradient_checkpointing = kwargs["value"]
656
- elif "enable" in kwargs:
657
- self.gradient_checkpointing = kwargs["enable"]
658
- else:
659
- raise ValueError("Invalid set gradient checkpointing")
660
-
661
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
662
- x = [
663
- self.model.encode(u.unsqueeze(0), self.scale).squeeze(0)
664
- for u in x
665
- ]
666
- x = torch.stack(x)
667
- return x
668
-
669
- @apply_forward_hook
670
- def encode(
671
- self, x: torch.Tensor, return_dict: bool = True
672
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
673
- h = self._encode(x)
674
-
675
- posterior = DiagonalGaussianDistribution(h)
676
-
677
- if not return_dict:
678
- return (posterior,)
679
- return AutoencoderKLOutput(latent_dist=posterior)
680
-
681
- def _decode(self, zs):
682
- dec = [
683
- self.model.decode(u.unsqueeze(0), self.scale).clamp_(-1, 1).squeeze(0)
684
- for u in zs
685
- ]
686
- dec = torch.stack(dec)
687
-
688
- return DecoderOutput(sample=dec)
689
-
690
- @apply_forward_hook
691
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
692
- decoded = self._decode(z).sample
693
-
694
- if not return_dict:
695
- return (decoded,)
696
- return DecoderOutput(sample=decoded)
697
-
698
- @classmethod
699
- def from_pretrained(cls, pretrained_model_path, additional_kwargs={}):
700
- def filter_kwargs(cls, kwargs):
701
- import inspect
702
- sig = inspect.signature(cls.__init__)
703
- valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
704
- filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
705
- return filtered_kwargs
706
-
707
- model = cls(**filter_kwargs(cls, additional_kwargs))
708
- if pretrained_model_path.endswith(".safetensors"):
709
- from safetensors.torch import load_file, safe_open
710
- state_dict = load_file(pretrained_model_path)
711
- else:
712
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
713
- tmp_state_dict = {}
714
- for key in state_dict:
715
- tmp_state_dict["model." + key] = state_dict[key]
716
- state_dict = tmp_state_dict
717
- m, u = model.load_state_dict(state_dict, strict=False)
718
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
719
- print(m, u)
720
- return model
721
-
722
-
723
- class AutoencoderKLWanCompileQwenImage(ModelMixin, ConfigMixin, FromOriginalModelMixin):
724
- @register_to_config
725
- def __init__(
726
- self,
727
- attn_scales = [],
728
- base_dim = 96,
729
- dim_mult = [
730
- 1,
731
- 2,
732
- 4,
733
- 4
734
- ],
735
- dropout = 0.0,
736
- latents_mean = [
737
- -0.7571,
738
- -0.7089,
739
- -0.9113,
740
- 0.1075,
741
- -0.1745,
742
- 0.9653,
743
- -0.1517,
744
- 1.5508,
745
- 0.4134,
746
- -0.0715,
747
- 0.5517,
748
- -0.3632,
749
- -0.1922,
750
- -0.9497,
751
- 0.2503,
752
- -0.2921
753
- ],
754
- latents_std = [
755
- 2.8184,
756
- 1.4541,
757
- 2.3275,
758
- 2.6558,
759
- 1.2196,
760
- 1.7708,
761
- 2.6052,
762
- 2.0743,
763
- 3.2687,
764
- 2.1526,
765
- 2.8652,
766
- 1.5579,
767
- 1.6382,
768
- 1.1253,
769
- 2.8251,
770
- 1.916
771
- ],
772
- num_res_blocks = 2,
773
- temperal_downsample = [
774
- False,
775
- True,
776
- True
777
- ],
778
- z_dim = 16
779
- ):
780
- super().__init__()
781
- cfg = dict(
782
- dim=base_dim,
783
- z_dim=z_dim,
784
- dim_mult=dim_mult,
785
- num_res_blocks=num_res_blocks,
786
- attn_scales=attn_scales,
787
- temperal_downsample=temperal_downsample,
788
- dropout=dropout)
789
-
790
- # init model
791
- self.model = AutoencoderKLWan_(**cfg)
792
-
793
- self.dim = base_dim
794
- self.z_dim = z_dim
795
- self.dim_mult = dim_mult
796
- self.num_res_blocks = num_res_blocks
797
- self.attn_scales = attn_scales
798
- self.temperal_downsample = temperal_downsample
799
- self.temperal_upsample = temperal_downsample[::-1]
800
-
801
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
802
- x = [
803
- self.model.encode(u.unsqueeze(0)).squeeze(0)
804
- for u in x
805
- ]
806
- x = torch.stack(x)
807
- return x
808
-
809
- @apply_forward_hook
810
- def encode(
811
- self, x: torch.Tensor, return_dict: bool = True
812
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
813
- h = self._encode(x)
814
-
815
- posterior = DiagonalGaussianDistribution(h)
816
-
817
- if not return_dict:
818
- return (posterior,)
819
- return AutoencoderKLOutput(latent_dist=posterior)
820
-
821
- def _decode(self, zs):
822
- dec = [
823
- self.model.decode(u.unsqueeze(0)).clamp_(-1, 1).squeeze(0)
824
- for u in zs
825
- ]
826
- dec = torch.stack(dec)
827
-
828
- return DecoderOutput(sample=dec)
829
-
830
- @apply_forward_hook
831
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
832
- decoded = self._decode(z).sample
833
-
834
- if not return_dict:
835
- return (decoded,)
836
- return DecoderOutput(sample=decoded)
837
-
838
- @classmethod
839
- def from_pretrained(cls, pretrained_model_path, additional_kwargs={}):
840
- def filter_kwargs(cls, kwargs):
841
- import inspect
842
- sig = inspect.signature(cls.__init__)
843
- valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
844
- filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
845
- return filtered_kwargs
846
-
847
- model = cls(**filter_kwargs(cls, additional_kwargs))
848
- if pretrained_model_path.endswith(".safetensors"):
849
- from safetensors.torch import load_file, safe_open
850
- state_dict = load_file(pretrained_model_path)
851
- else:
852
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
853
- tmp_state_dict = {}
854
- for key in state_dict:
855
- tmp_state_dict["model." + key] = state_dict[key]
856
- state_dict = tmp_state_dict
857
- m, u = model.load_state_dict(state_dict, strict=False)
858
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
859
- print(m, u)
860
- return model
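AutoencoderKLWan above keeps per-channel latent statistics and passes self.scale = [mean, 1.0 / std] into encode/decode, so latents are standardized channel-wise before the diffusion transformer sees them and de-standardized before decoding. A small sketch of that round trip, assuming a [B, C, T, H, W] latent with C == 16 (the tensors below are synthetic, not the real statistics):

import torch

z_dim = 16
mean = torch.randn(z_dim)          # stands in for the per-channel latents_mean
std = torch.rand(z_dim) + 0.5      # stands in for the per-channel latents_std

mu = torch.randn(2, z_dim, 4, 8, 8)                                  # raw encoder output
inv_std = (1.0 / std).view(1, z_dim, 1, 1, 1)
mu_norm = (mu - mean.view(1, z_dim, 1, 1, 1)) * inv_std              # encode-side standardization
z_back = mu_norm / inv_std + mean.view(1, z_dim, 1, 1, 1)            # decode-side inverse
assert torch.allclose(z_back, mu, atol=1e-5)

Folding the statistics into the VAE this way keeps the diffusion model's inputs roughly unit-variance per channel without requiring a separate scaling step in the sampling pipeline.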
videox_fun/models/wan_vae3_8.py DELETED
@@ -1,1091 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import logging
3
- from typing import Tuple, Union
4
-
5
- import torch
6
- import torch.cuda.amp as amp
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from diffusers.configuration_utils import ConfigMixin, register_to_config
10
- from diffusers.loaders.single_file_model import FromOriginalModelMixin
11
- from diffusers.models.autoencoders.vae import (DecoderOutput,
12
- DiagonalGaussianDistribution)
13
- from diffusers.models.modeling_outputs import AutoencoderKLOutput
14
- from diffusers.models.modeling_utils import ModelMixin
15
- from diffusers.utils.accelerate_utils import apply_forward_hook
16
- from einops import rearrange
17
-
18
-
19
- CACHE_T = 2
20
-
21
-
22
- class CausalConv3d(nn.Conv3d):
23
- """
24
- Causal 3d convolusion.
25
- """
26
-
27
- def __init__(self, *args, **kwargs):
28
- super().__init__(*args, **kwargs)
29
- self._padding = (
30
- self.padding[2],
31
- self.padding[2],
32
- self.padding[1],
33
- self.padding[1],
34
- 2 * self.padding[0],
35
- 0,
36
- )
37
- self.padding = (0, 0, 0)
38
-
39
- def forward(self, x, cache_x=None):
40
- padding = list(self._padding)
41
- if cache_x is not None and self._padding[4] > 0:
42
- cache_x = cache_x.to(x.device)
43
- x = torch.cat([cache_x, x], dim=2)
44
- padding[4] -= cache_x.shape[2]
45
- x = F.pad(x, padding)
46
-
47
- return super().forward(x)
48
-
49
-
50
- class RMS_norm(nn.Module):
51
-
52
- def __init__(self, dim, channel_first=True, images=True, bias=False):
53
- super().__init__()
54
- broadcastable_dims = (1, 1, 1) if not images else (1, 1)
55
- shape = (dim, *broadcastable_dims) if channel_first else (dim,)
56
-
57
- self.channel_first = channel_first
58
- self.scale = dim**0.5
59
- self.gamma = nn.Parameter(torch.ones(shape))
60
- self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
61
-
62
- def forward(self, x):
63
- return (F.normalize(x, dim=(1 if self.channel_first else -1)) *
64
- self.scale * self.gamma + self.bias)
65
-
66
-
67
- class Upsample(nn.Upsample):
68
-
69
- def forward(self, x):
70
- """
71
- Fix bfloat16 support for nearest neighbor interpolation.
72
- """
73
- return super().forward(x.float()).type_as(x)
74
-
75
-
76
- class Resample(nn.Module):
77
-
78
- def __init__(self, dim, mode):
79
- assert mode in (
80
- "none",
81
- "upsample2d",
82
- "upsample3d",
83
- "downsample2d",
84
- "downsample3d",
85
- )
86
- super().__init__()
87
- self.dim = dim
88
- self.mode = mode
89
-
90
- # layers
91
- if mode == "upsample2d":
92
- self.resample = nn.Sequential(
93
- Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
94
- nn.Conv2d(dim, dim, 3, padding=1),
95
- )
96
- elif mode == "upsample3d":
97
- self.resample = nn.Sequential(
98
- Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
99
- nn.Conv2d(dim, dim, 3, padding=1),
100
- # nn.Conv2d(dim, dim//2, 3, padding=1)
101
- )
102
- self.time_conv = CausalConv3d(
103
- dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
104
- elif mode == "downsample2d":
105
- self.resample = nn.Sequential(
106
- nn.ZeroPad2d((0, 1, 0, 1)),
107
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
108
- elif mode == "downsample3d":
109
- self.resample = nn.Sequential(
110
- nn.ZeroPad2d((0, 1, 0, 1)),
111
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
112
- self.time_conv = CausalConv3d(
113
- dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
114
- else:
115
- self.resample = nn.Identity()
116
-
117
- def forward(self, x, feat_cache=None, feat_idx=[0]):
118
- b, c, t, h, w = x.size()
119
- if self.mode == "upsample3d":
120
- if feat_cache is not None:
121
- idx = feat_idx[0]
122
- if feat_cache[idx] is None:
123
- feat_cache[idx] = "Rep"
124
- feat_idx[0] += 1
125
- else:
126
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
127
- if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
128
- feat_cache[idx] != "Rep"):
129
- # cache last frame of last two chunk
130
- cache_x = torch.cat(
131
- [
132
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
133
- cache_x.device),
134
- cache_x,
135
- ],
136
- dim=2,
137
- )
138
- if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
139
- feat_cache[idx] == "Rep"):
140
- cache_x = torch.cat(
141
- [
142
- torch.zeros_like(cache_x).to(cache_x.device),
143
- cache_x
144
- ],
145
- dim=2,
146
- )
147
- if feat_cache[idx] == "Rep":
148
- x = self.time_conv(x)
149
- else:
150
- x = self.time_conv(x, feat_cache[idx])
151
- feat_cache[idx] = cache_x
152
- feat_idx[0] += 1
153
- x = x.reshape(b, 2, c, t, h, w)
154
- x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
155
- 3)
156
- x = x.reshape(b, c, t * 2, h, w)
157
- t = x.shape[2]
158
- x = rearrange(x, "b c t h w -> (b t) c h w")
159
- x = self.resample(x)
160
- x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
161
-
162
- if self.mode == "downsample3d":
163
- if feat_cache is not None:
164
- idx = feat_idx[0]
165
- if feat_cache[idx] is None:
166
- feat_cache[idx] = x.clone()
167
- feat_idx[0] += 1
168
- else:
169
- cache_x = x[:, :, -1:, :, :].clone()
170
- x = self.time_conv(
171
- torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
172
- feat_cache[idx] = cache_x
173
- feat_idx[0] += 1
174
- return x
175
-
176
- def init_weight(self, conv):
177
- conv_weight = conv.weight.detach().clone()
178
- nn.init.zeros_(conv_weight)
179
- c1, c2, t, h, w = conv_weight.size()
180
- one_matrix = torch.eye(c1, c2)
181
- init_matrix = one_matrix
182
- nn.init.zeros_(conv_weight)
183
- conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
184
- conv.weight = nn.Parameter(conv_weight)
185
- nn.init.zeros_(conv.bias.data)
186
-
187
- def init_weight2(self, conv):
188
- conv_weight = conv.weight.data.detach().clone()
189
- nn.init.zeros_(conv_weight)
190
- c1, c2, t, h, w = conv_weight.size()
191
- init_matrix = torch.eye(c1 // 2, c2)
192
- conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
193
- conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
194
- conv.weight = nn.Parameter(conv_weight)
195
- nn.init.zeros_(conv.bias.data)
196
-
197
-
198
- class ResidualBlock(nn.Module):
199
-
200
- def __init__(self, in_dim, out_dim, dropout=0.0):
201
- super().__init__()
202
- self.in_dim = in_dim
203
- self.out_dim = out_dim
204
-
205
- # layers
206
- self.residual = nn.Sequential(
207
- RMS_norm(in_dim, images=False),
208
- nn.SiLU(),
209
- CausalConv3d(in_dim, out_dim, 3, padding=1),
210
- RMS_norm(out_dim, images=False),
211
- nn.SiLU(),
212
- nn.Dropout(dropout),
213
- CausalConv3d(out_dim, out_dim, 3, padding=1),
214
- )
215
- self.shortcut = (
216
- CausalConv3d(in_dim, out_dim, 1)
217
- if in_dim != out_dim else nn.Identity())
218
-
219
- def forward(self, x, feat_cache=None, feat_idx=[0]):
220
- h = self.shortcut(x)
221
- for layer in self.residual:
222
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
223
- idx = feat_idx[0]
224
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
225
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
226
- # cache last frame of last two chunk
227
- cache_x = torch.cat(
228
- [
229
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
230
- cache_x.device),
231
- cache_x,
232
- ],
233
- dim=2,
234
- )
235
- x = layer(x, feat_cache[idx])
236
- feat_cache[idx] = cache_x
237
- feat_idx[0] += 1
238
- else:
239
- x = layer(x)
240
- return x + h
241
-
242
-
243
- class AttentionBlock(nn.Module):
244
- """
245
- Causal self-attention with a single head.
246
- """
247
-
248
- def __init__(self, dim):
249
- super().__init__()
250
- self.dim = dim
251
-
252
- # layers
253
- self.norm = RMS_norm(dim)
254
- self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
255
- self.proj = nn.Conv2d(dim, dim, 1)
256
-
257
- # zero out the last layer params
258
- nn.init.zeros_(self.proj.weight)
259
-
260
- def forward(self, x):
261
- identity = x
262
- b, c, t, h, w = x.size()
263
- x = rearrange(x, "b c t h w -> (b t) c h w")
264
- x = self.norm(x)
265
- # compute query, key, value
266
- q, k, v = (
267
- self.to_qkv(x).reshape(b * t, 1, c * 3,
268
- -1).permute(0, 1, 3,
269
- 2).contiguous().chunk(3, dim=-1))
270
-
271
- # apply attention
272
- x = F.scaled_dot_product_attention(
273
- q,
274
- k,
275
- v,
276
- )
277
- x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
278
-
279
- # output
280
- x = self.proj(x)
281
- x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
282
- return x + identity
283
-
284
-
285
- def patchify(x, patch_size):
286
- if patch_size == 1:
287
- return x
288
- if x.dim() == 4:
289
- x = rearrange(
290
- x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
291
- elif x.dim() == 5:
292
- x = rearrange(
293
- x,
294
- "b c f (h q) (w r) -> b (c r q) f h w",
295
- q=patch_size,
296
- r=patch_size,
297
- )
298
- else:
299
- raise ValueError(f"Invalid input shape: {x.shape}")
300
-
301
- return x
302
-
303
-
304
- def unpatchify(x, patch_size):
305
- if patch_size == 1:
306
- return x
307
-
308
- if x.dim() == 4:
309
- x = rearrange(
310
- x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
311
- elif x.dim() == 5:
312
- x = rearrange(
313
- x,
314
- "b (c r q) f h w -> b c f (h q) (w r)",
315
- q=patch_size,
316
- r=patch_size,
317
- )
318
- return x
319
-
320
-
321
- class AvgDown3D(nn.Module):
322
-
323
- def __init__(
324
- self,
325
- in_channels,
326
- out_channels,
327
- factor_t,
328
- factor_s=1,
329
- ):
330
- super().__init__()
331
- self.in_channels = in_channels
332
- self.out_channels = out_channels
333
- self.factor_t = factor_t
334
- self.factor_s = factor_s
335
- self.factor = self.factor_t * self.factor_s * self.factor_s
336
-
337
- assert in_channels * self.factor % out_channels == 0
338
- self.group_size = in_channels * self.factor // out_channels
339
-
340
- def forward(self, x: torch.Tensor) -> torch.Tensor:
341
- pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
342
- pad = (0, 0, 0, 0, pad_t, 0)
343
- x = F.pad(x, pad)
344
- B, C, T, H, W = x.shape
345
- x = x.view(
346
- B,
347
- C,
348
- T // self.factor_t,
349
- self.factor_t,
350
- H // self.factor_s,
351
- self.factor_s,
352
- W // self.factor_s,
353
- self.factor_s,
354
- )
355
- x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
356
- x = x.view(
357
- B,
358
- C * self.factor,
359
- T // self.factor_t,
360
- H // self.factor_s,
361
- W // self.factor_s,
362
- )
363
- x = x.view(
364
- B,
365
- self.out_channels,
366
- self.group_size,
367
- T // self.factor_t,
368
- H // self.factor_s,
369
- W // self.factor_s,
370
- )
371
- x = x.mean(dim=2)
372
- return x
373
-
374
-
375
- class DupUp3D(nn.Module):
376
-
377
- def __init__(
378
- self,
379
- in_channels: int,
380
- out_channels: int,
381
- factor_t,
382
- factor_s=1,
383
- ):
384
- super().__init__()
385
- self.in_channels = in_channels
386
- self.out_channels = out_channels
387
-
388
- self.factor_t = factor_t
389
- self.factor_s = factor_s
390
- self.factor = self.factor_t * self.factor_s * self.factor_s
391
-
392
- assert out_channels * self.factor % in_channels == 0
393
- self.repeats = out_channels * self.factor // in_channels
394
-
395
- def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
396
- x = x.repeat_interleave(self.repeats, dim=1)
397
- x = x.view(
398
- x.size(0),
399
- self.out_channels,
400
- self.factor_t,
401
- self.factor_s,
402
- self.factor_s,
403
- x.size(2),
404
- x.size(3),
405
- x.size(4),
406
- )
407
- x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
408
- x = x.view(
409
- x.size(0),
410
- self.out_channels,
411
- x.size(2) * self.factor_t,
412
- x.size(4) * self.factor_s,
413
- x.size(6) * self.factor_s,
414
- )
415
- if first_chunk:
416
- x = x[:, :, self.factor_t - 1:, :, :]
417
- return x
418
-
419
-
420
- class Down_ResidualBlock(nn.Module):
421
-
422
- def __init__(self,
423
- in_dim,
424
- out_dim,
425
- dropout,
426
- mult,
427
- temperal_downsample=False,
428
- down_flag=False):
429
- super().__init__()
430
-
431
- # Shortcut path with downsample
432
- self.avg_shortcut = AvgDown3D(
433
- in_dim,
434
- out_dim,
435
- factor_t=2 if temperal_downsample else 1,
436
- factor_s=2 if down_flag else 1,
437
- )
438
-
439
- # Main path with residual blocks and downsample
440
- downsamples = []
441
- for _ in range(mult):
442
- downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
443
- in_dim = out_dim
444
-
445
- # Add the final downsample block
446
- if down_flag:
447
- mode = "downsample3d" if temperal_downsample else "downsample2d"
448
- downsamples.append(Resample(out_dim, mode=mode))
449
-
450
- self.downsamples = nn.Sequential(*downsamples)
451
-
452
- def forward(self, x, feat_cache=None, feat_idx=[0]):
453
- x_copy = x.clone()
454
- for module in self.downsamples:
455
- x = module(x, feat_cache, feat_idx)
456
-
457
- return x + self.avg_shortcut(x_copy)
458
-
459
-
460
- class Up_ResidualBlock(nn.Module):
461
-
462
- def __init__(self,
463
- in_dim,
464
- out_dim,
465
- dropout,
466
- mult,
467
- temperal_upsample=False,
468
- up_flag=False):
469
- super().__init__()
470
- # Shortcut path with upsample
471
- if up_flag:
472
- self.avg_shortcut = DupUp3D(
473
- in_dim,
474
- out_dim,
475
- factor_t=2 if temperal_upsample else 1,
476
- factor_s=2 if up_flag else 1,
477
- )
478
- else:
479
- self.avg_shortcut = None
480
-
481
- # Main path with residual blocks and upsample
482
- upsamples = []
483
- for _ in range(mult):
484
- upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
485
- in_dim = out_dim
486
-
487
- # Add the final upsample block
488
- if up_flag:
489
- mode = "upsample3d" if temperal_upsample else "upsample2d"
490
- upsamples.append(Resample(out_dim, mode=mode))
491
-
492
- self.upsamples = nn.Sequential(*upsamples)
493
-
494
- def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
495
- x_main = x.clone()
496
- for module in self.upsamples:
497
- x_main = module(x_main, feat_cache, feat_idx)
498
- if self.avg_shortcut is not None:
499
- x_shortcut = self.avg_shortcut(x, first_chunk)
500
- return x_main + x_shortcut
501
- else:
502
- return x_main
503
-
504
-
505
- class Encoder3d(nn.Module):
506
-
507
- def __init__(
508
- self,
509
- dim=128,
510
- z_dim=4,
511
- dim_mult=[1, 2, 4, 4],
512
- num_res_blocks=2,
513
- attn_scales=[],
514
- temperal_downsample=[True, True, False],
515
- dropout=0.0,
516
- ):
517
- super().__init__()
518
- self.dim = dim
519
- self.z_dim = z_dim
520
- self.dim_mult = dim_mult
521
- self.num_res_blocks = num_res_blocks
522
- self.attn_scales = attn_scales
523
- self.temperal_downsample = temperal_downsample
524
-
525
- # dimensions
526
- dims = [dim * u for u in [1] + dim_mult]
527
- scale = 1.0
528
-
529
- # init block
530
- self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
531
-
532
- # downsample blocks
533
- downsamples = []
534
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
535
- t_down_flag = (
536
- temperal_downsample[i]
537
- if i < len(temperal_downsample) else False)
538
- downsamples.append(
539
- Down_ResidualBlock(
540
- in_dim=in_dim,
541
- out_dim=out_dim,
542
- dropout=dropout,
543
- mult=num_res_blocks,
544
- temperal_downsample=t_down_flag,
545
- down_flag=i != len(dim_mult) - 1,
546
- ))
547
- scale /= 2.0
548
- self.downsamples = nn.Sequential(*downsamples)
549
-
550
- # middle blocks
551
- self.middle = nn.Sequential(
552
- ResidualBlock(out_dim, out_dim, dropout),
553
- AttentionBlock(out_dim),
554
- ResidualBlock(out_dim, out_dim, dropout),
555
- )
556
-
557
- # # output blocks
558
- self.head = nn.Sequential(
559
- RMS_norm(out_dim, images=False),
560
- nn.SiLU(),
561
- CausalConv3d(out_dim, z_dim, 3, padding=1),
562
- )
563
-
564
- def forward(self, x, feat_cache=None, feat_idx=[0]):
565
-
566
- if feat_cache is not None:
567
- idx = feat_idx[0]
568
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
569
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
570
- cache_x = torch.cat(
571
- [
572
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
573
- cache_x.device),
574
- cache_x,
575
- ],
576
- dim=2,
577
- )
578
- x = self.conv1(x, feat_cache[idx])
579
- feat_cache[idx] = cache_x
580
- feat_idx[0] += 1
581
- else:
582
- x = self.conv1(x)
583
-
584
- ## downsamples
585
- for layer in self.downsamples:
586
- if feat_cache is not None:
587
- x = layer(x, feat_cache, feat_idx)
588
- else:
589
- x = layer(x)
590
-
591
- ## middle
592
- for layer in self.middle:
593
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
594
- x = layer(x, feat_cache, feat_idx)
595
- else:
596
- x = layer(x)
597
-
598
- ## head
599
- for layer in self.head:
600
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
601
- idx = feat_idx[0]
602
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
603
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
604
- cache_x = torch.cat(
605
- [
606
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
607
- cache_x.device),
608
- cache_x,
609
- ],
610
- dim=2,
611
- )
612
- x = layer(x, feat_cache[idx])
613
- feat_cache[idx] = cache_x
614
- feat_idx[0] += 1
615
- else:
616
- x = layer(x)
617
-
618
- return x
619
-
620
-
621
- class Decoder3d(nn.Module):
622
-
623
- def __init__(
624
- self,
625
- dim=128,
626
- z_dim=4,
627
- dim_mult=[1, 2, 4, 4],
628
- num_res_blocks=2,
629
- attn_scales=[],
630
- temperal_upsample=[False, True, True],
631
- dropout=0.0,
632
- ):
633
- super().__init__()
634
- self.dim = dim
635
- self.z_dim = z_dim
636
- self.dim_mult = dim_mult
637
- self.num_res_blocks = num_res_blocks
638
- self.attn_scales = attn_scales
639
- self.temperal_upsample = temperal_upsample
640
-
641
- # dimensions
642
- dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
643
- scale = 1.0 / 2**(len(dim_mult) - 2)
644
- # init block
645
- self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
646
-
647
- # middle blocks
648
- self.middle = nn.Sequential(
649
- ResidualBlock(dims[0], dims[0], dropout),
650
- AttentionBlock(dims[0]),
651
- ResidualBlock(dims[0], dims[0], dropout),
652
- )
653
-
654
- # upsample blocks
655
- upsamples = []
656
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
657
- t_up_flag = temperal_upsample[i] if i < len(
658
- temperal_upsample) else False
659
- upsamples.append(
660
- Up_ResidualBlock(
661
- in_dim=in_dim,
662
- out_dim=out_dim,
663
- dropout=dropout,
664
- mult=num_res_blocks + 1,
665
- temperal_upsample=t_up_flag,
666
- up_flag=i != len(dim_mult) - 1,
667
- ))
668
- self.upsamples = nn.Sequential(*upsamples)
669
-
670
- # output blocks
671
- self.head = nn.Sequential(
672
- RMS_norm(out_dim, images=False),
673
- nn.SiLU(),
674
- CausalConv3d(out_dim, 12, 3, padding=1),
675
- )
676
-
677
- def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
678
- if feat_cache is not None:
679
- idx = feat_idx[0]
680
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
681
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
682
- cache_x = torch.cat(
683
- [
684
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
685
- cache_x.device),
686
- cache_x,
687
- ],
688
- dim=2,
689
- )
690
- x = self.conv1(x, feat_cache[idx])
691
- feat_cache[idx] = cache_x
692
- feat_idx[0] += 1
693
- else:
694
- x = self.conv1(x)
695
-
696
- for layer in self.middle:
697
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
698
- x = layer(x, feat_cache, feat_idx)
699
- else:
700
- x = layer(x)
701
-
702
- ## upsamples
703
- for layer in self.upsamples:
704
- if feat_cache is not None:
705
- x = layer(x, feat_cache, feat_idx, first_chunk)
706
- else:
707
- x = layer(x)
708
-
709
- ## head
710
- for layer in self.head:
711
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
712
- idx = feat_idx[0]
713
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
714
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
715
- cache_x = torch.cat(
716
- [
717
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
718
- cache_x.device),
719
- cache_x,
720
- ],
721
- dim=2,
722
- )
723
- x = layer(x, feat_cache[idx])
724
- feat_cache[idx] = cache_x
725
- feat_idx[0] += 1
726
- else:
727
- x = layer(x)
728
- return x
729
-
730
-
731
- def count_conv3d(model):
732
- count = 0
733
- for m in model.modules():
734
- if isinstance(m, CausalConv3d):
735
- count += 1
736
- return count
737
-
738
-
739
- class AutoencoderKLWan2_2_(nn.Module):
740
-
741
- def __init__(
742
- self,
743
- dim=160,
744
- dec_dim=256,
745
- z_dim=16,
746
- dim_mult=[1, 2, 4, 4],
747
- num_res_blocks=2,
748
- attn_scales=[],
749
- temperal_downsample=[True, True, False],
750
- dropout=0.0,
751
- ):
752
- super().__init__()
753
- self.dim = dim
754
- self.z_dim = z_dim
755
- self.dim_mult = dim_mult
756
- self.num_res_blocks = num_res_blocks
757
- self.attn_scales = attn_scales
758
- self.temperal_downsample = temperal_downsample
759
- self.temperal_upsample = temperal_downsample[::-1]
760
-
761
- # modules
762
- self.encoder = Encoder3d(
763
- dim,
764
- z_dim * 2,
765
- dim_mult,
766
- num_res_blocks,
767
- attn_scales,
768
- self.temperal_downsample,
769
- dropout,
770
- )
771
- self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
772
- self.conv2 = CausalConv3d(z_dim, z_dim, 1)
773
- self.decoder = Decoder3d(
774
- dec_dim,
775
- z_dim,
776
- dim_mult,
777
- num_res_blocks,
778
- attn_scales,
779
- self.temperal_upsample,
780
- dropout,
781
- )
782
-
783
- def forward(self, x, scale=[0, 1]):
784
- mu = self.encode(x, scale)
785
- x_recon = self.decode(mu, scale)
786
- return x_recon, mu
787
-
788
- def encode(self, x, scale):
789
- self.clear_cache()
790
- # z: [b,c,t,h,w]
791
- scale = [item.to(x.device, x.dtype) for item in scale]
792
- x = patchify(x, patch_size=2)
793
- t = x.shape[2]
794
- iter_ = 1 + (t - 1) // 4
795
- for i in range(iter_):
796
- self._enc_conv_idx = [0]
797
- if i == 0:
798
- out = self.encoder(
799
- x[:, :, :1, :, :],
800
- feat_cache=self._enc_feat_map,
801
- feat_idx=self._enc_conv_idx,
802
- )
803
- else:
804
- out_ = self.encoder(
805
- x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
806
- feat_cache=self._enc_feat_map,
807
- feat_idx=self._enc_conv_idx,
808
- )
809
- out = torch.cat([out, out_], 2)
810
- mu, log_var = self.conv1(out).chunk(2, dim=1)
811
- if isinstance(scale[0], torch.Tensor):
812
- mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
813
- 1, self.z_dim, 1, 1, 1)
814
- else:
815
- mu = (mu - scale[0]) * scale[1]
816
- x = torch.cat([mu, log_var], dim = 1)
817
- self.clear_cache()
818
- return x
819
-
820
- def decode(self, z, scale):
821
- self.clear_cache()
822
- # z: [b,c,t,h,w]
823
- scale = [item.to(z.device, z.dtype) for item in scale]
824
- if isinstance(scale[0], torch.Tensor):
825
- z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
826
- 1, self.z_dim, 1, 1, 1)
827
- else:
828
- z = z / scale[1] + scale[0]
829
- iter_ = z.shape[2]
830
- x = self.conv2(z)
831
- for i in range(iter_):
832
- self._conv_idx = [0]
833
- if i == 0:
834
- out = self.decoder(
835
- x[:, :, i:i + 1, :, :],
836
- feat_cache=self._feat_map,
837
- feat_idx=self._conv_idx,
838
- first_chunk=True,
839
- )
840
- else:
841
- out_ = self.decoder(
842
- x[:, :, i:i + 1, :, :],
843
- feat_cache=self._feat_map,
844
- feat_idx=self._conv_idx,
845
- )
846
- out = torch.cat([out, out_], 2)
847
- out = unpatchify(out, patch_size=2)
848
- self.clear_cache()
849
- return out
850
-
851
- def reparameterize(self, mu, log_var):
852
- std = torch.exp(0.5 * log_var)
853
- eps = torch.randn_like(std)
854
- return eps * std + mu
855
-
856
- def sample(self, imgs, deterministic=False):
857
- mu, log_var = self.encode(imgs)
858
- if deterministic:
859
- return mu
860
- std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
861
- return mu + std * torch.randn_like(std)
862
-
863
- def clear_cache(self):
864
- self._conv_num = count_conv3d(self.decoder)
865
- self._conv_idx = [0]
866
- self._feat_map = [None] * self._conv_num
867
- # cache encode
868
- self._enc_conv_num = count_conv3d(self.encoder)
869
- self._enc_conv_idx = [0]
870
- self._enc_feat_map = [None] * self._enc_conv_num
871
-
872
-
873
- def _video_vae(pretrained_path=None, z_dim=16, dim=160, device="cpu", **kwargs):
874
- # params
875
- cfg = dict(
876
- dim=dim,
877
- z_dim=z_dim,
878
- dim_mult=[1, 2, 4, 4],
879
- num_res_blocks=2,
880
- attn_scales=[],
881
- temperal_downsample=[True, True, True],
882
- dropout=0.0,
883
- )
884
- cfg.update(**kwargs)
885
-
886
- # init model
887
- model = AutoencoderKLWan2_2_(**cfg)
888
-
889
- return model
890
-
891
-
892
- class AutoencoderKLWan3_8(ModelMixin, ConfigMixin, FromOriginalModelMixin):
893
- _supports_gradient_checkpointing = True
894
-
895
- @register_to_config
896
- def __init__(
897
- self,
898
- latent_channels=48,
899
- c_dim=160,
900
- vae_pth=None,
901
- dim_mult=[1, 2, 4, 4],
902
- temperal_downsample=[False, True, True],
903
- temporal_compression_ratio=4,
904
- spatial_compression_ratio=8
905
- ):
906
- super().__init__()
907
- mean = torch.tensor(
908
- [
909
- -0.2289,
910
- -0.0052,
911
- -0.1323,
912
- -0.2339,
913
- -0.2799,
914
- 0.0174,
915
- 0.1838,
916
- 0.1557,
917
- -0.1382,
918
- 0.0542,
919
- 0.2813,
920
- 0.0891,
921
- 0.1570,
922
- -0.0098,
923
- 0.0375,
924
- -0.1825,
925
- -0.2246,
926
- -0.1207,
927
- -0.0698,
928
- 0.5109,
929
- 0.2665,
930
- -0.2108,
931
- -0.2158,
932
- 0.2502,
933
- -0.2055,
934
- -0.0322,
935
- 0.1109,
936
- 0.1567,
937
- -0.0729,
938
- 0.0899,
939
- -0.2799,
940
- -0.1230,
941
- -0.0313,
942
- -0.1649,
943
- 0.0117,
944
- 0.0723,
945
- -0.2839,
946
- -0.2083,
947
- -0.0520,
948
- 0.3748,
949
- 0.0152,
950
- 0.1957,
951
- 0.1433,
952
- -0.2944,
953
- 0.3573,
954
- -0.0548,
955
- -0.1681,
956
- -0.0667,
957
- ], dtype=torch.float32
958
- )
959
- std = torch.tensor(
960
- [
961
- 0.4765,
962
- 1.0364,
963
- 0.4514,
964
- 1.1677,
965
- 0.5313,
966
- 0.4990,
967
- 0.4818,
968
- 0.5013,
969
- 0.8158,
970
- 1.0344,
971
- 0.5894,
972
- 1.0901,
973
- 0.6885,
974
- 0.6165,
975
- 0.8454,
976
- 0.4978,
977
- 0.5759,
978
- 0.3523,
979
- 0.7135,
980
- 0.6804,
981
- 0.5833,
982
- 1.4146,
983
- 0.8986,
984
- 0.5659,
985
- 0.7069,
986
- 0.5338,
987
- 0.4889,
988
- 0.4917,
989
- 0.4069,
990
- 0.4999,
991
- 0.6866,
992
- 0.4093,
993
- 0.5709,
994
- 0.6065,
995
- 0.6415,
996
- 0.4944,
997
- 0.5726,
998
- 1.2042,
999
- 0.5458,
1000
- 1.6887,
1001
- 0.3971,
1002
- 1.0600,
1003
- 0.3943,
1004
- 0.5537,
1005
- 0.5444,
1006
- 0.4089,
1007
- 0.7468,
1008
- 0.7744,
1009
- ], dtype=torch.float32
1010
- )
1011
- self.scale = [mean, 1.0 / std]
1012
-
1013
- # init model
1014
- self.model = _video_vae(
1015
- pretrained_path=vae_pth,
1016
- z_dim=latent_channels,
1017
- dim=c_dim,
1018
- dim_mult=dim_mult,
1019
- temperal_downsample=temperal_downsample,
1020
- ).eval().requires_grad_(False)
1021
-
1022
- self.gradient_checkpointing = False
1023
-
1024
- def _set_gradient_checkpointing(self, *args, **kwargs):
1025
- if "value" in kwargs:
1026
- self.gradient_checkpointing = kwargs["value"]
1027
- elif "enable" in kwargs:
1028
- self.gradient_checkpointing = kwargs["enable"]
1029
- else:
1030
- raise ValueError("Invalid set gradient checkpointing")
1031
-
1032
- def _encode(self, x: torch.Tensor) -> torch.Tensor:
1033
- x = [
1034
- self.model.encode(u.unsqueeze(0), self.scale).squeeze(0)
1035
- for u in x
1036
- ]
1037
- x = torch.stack(x)
1038
- return x
1039
-
1040
- @apply_forward_hook
1041
- def encode(
1042
- self, x: torch.Tensor, return_dict: bool = True
1043
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
1044
- h = self._encode(x)
1045
-
1046
- posterior = DiagonalGaussianDistribution(h)
1047
-
1048
- if not return_dict:
1049
- return (posterior,)
1050
- return AutoencoderKLOutput(latent_dist=posterior)
1051
-
1052
- def _decode(self, zs):
1053
- dec = [
1054
- self.model.decode(u.unsqueeze(0), self.scale).clamp_(-1, 1).squeeze(0)
1055
- for u in zs
1056
- ]
1057
- dec = torch.stack(dec)
1058
-
1059
- return DecoderOutput(sample=dec)
1060
-
1061
- @apply_forward_hook
1062
- def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
1063
- decoded = self._decode(z).sample
1064
-
1065
- if not return_dict:
1066
- return (decoded,)
1067
- return DecoderOutput(sample=decoded)
1068
-
1069
- @classmethod
1070
- def from_pretrained(cls, pretrained_model_path, additional_kwargs={}):
1071
- def filter_kwargs(cls, kwargs):
1072
- import inspect
1073
- sig = inspect.signature(cls.__init__)
1074
- valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
1075
- filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
1076
- return filtered_kwargs
1077
-
1078
- model = cls(**filter_kwargs(cls, additional_kwargs))
1079
- if pretrained_model_path.endswith(".safetensors"):
1080
- from safetensors.torch import load_file, safe_open
1081
- state_dict = load_file(pretrained_model_path)
1082
- else:
1083
- state_dict = torch.load(pretrained_model_path, map_location="cpu")
1084
- tmp_state_dict = {}
1085
- for key in state_dict:
1086
- tmp_state_dict["model." + key] = state_dict[key]
1087
- state_dict = tmp_state_dict
1088
- m, u = model.load_state_dict(state_dict, strict=False)
1089
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1090
- print(m, u)
1091
- return model
videox_fun/models/wan_xlm_roberta.py DELETED
@@ -1,170 +0,0 @@
1
- # Modified from transformers.models.xlm_roberta.modeling_xlm_roberta
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import torch
4
- import torch.nn as nn
5
- import torch.nn.functional as F
6
-
7
- __all__ = ['XLMRoberta', 'xlm_roberta_large']
8
-
9
-
10
- class SelfAttention(nn.Module):
11
-
12
- def __init__(self, dim, num_heads, dropout=0.1, eps=1e-5):
13
- assert dim % num_heads == 0
14
- super().__init__()
15
- self.dim = dim
16
- self.num_heads = num_heads
17
- self.head_dim = dim // num_heads
18
- self.eps = eps
19
-
20
- # layers
21
- self.q = nn.Linear(dim, dim)
22
- self.k = nn.Linear(dim, dim)
23
- self.v = nn.Linear(dim, dim)
24
- self.o = nn.Linear(dim, dim)
25
- self.dropout = nn.Dropout(dropout)
26
-
27
- def forward(self, x, mask):
28
- """
29
- x: [B, L, C].
30
- """
31
- b, s, c, n, d = *x.size(), self.num_heads, self.head_dim
32
-
33
- # compute query, key, value
34
- q = self.q(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
35
- k = self.k(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
36
- v = self.v(x).reshape(b, s, n, d).permute(0, 2, 1, 3)
37
-
38
- # compute attention
39
- p = self.dropout.p if self.training else 0.0
40
- x = F.scaled_dot_product_attention(q, k, v, mask, p)
41
- x = x.permute(0, 2, 1, 3).reshape(b, s, c)
42
-
43
- # output
44
- x = self.o(x)
45
- x = self.dropout(x)
46
- return x
47
-
48
-
49
- class AttentionBlock(nn.Module):
50
-
51
- def __init__(self, dim, num_heads, post_norm, dropout=0.1, eps=1e-5):
52
- super().__init__()
53
- self.dim = dim
54
- self.num_heads = num_heads
55
- self.post_norm = post_norm
56
- self.eps = eps
57
-
58
- # layers
59
- self.attn = SelfAttention(dim, num_heads, dropout, eps)
60
- self.norm1 = nn.LayerNorm(dim, eps=eps)
61
- self.ffn = nn.Sequential(
62
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim),
63
- nn.Dropout(dropout))
64
- self.norm2 = nn.LayerNorm(dim, eps=eps)
65
-
66
- def forward(self, x, mask):
67
- if self.post_norm:
68
- x = self.norm1(x + self.attn(x, mask))
69
- x = self.norm2(x + self.ffn(x))
70
- else:
71
- x = x + self.attn(self.norm1(x), mask)
72
- x = x + self.ffn(self.norm2(x))
73
- return x
74
-
75
-
76
- class XLMRoberta(nn.Module):
77
- """
78
- XLMRobertaModel with no pooler and no LM head.
79
- """
80
-
81
- def __init__(self,
82
- vocab_size=250002,
83
- max_seq_len=514,
84
- type_size=1,
85
- pad_id=1,
86
- dim=1024,
87
- num_heads=16,
88
- num_layers=24,
89
- post_norm=True,
90
- dropout=0.1,
91
- eps=1e-5):
92
- super().__init__()
93
- self.vocab_size = vocab_size
94
- self.max_seq_len = max_seq_len
95
- self.type_size = type_size
96
- self.pad_id = pad_id
97
- self.dim = dim
98
- self.num_heads = num_heads
99
- self.num_layers = num_layers
100
- self.post_norm = post_norm
101
- self.eps = eps
102
-
103
- # embeddings
104
- self.token_embedding = nn.Embedding(vocab_size, dim, padding_idx=pad_id)
105
- self.type_embedding = nn.Embedding(type_size, dim)
106
- self.pos_embedding = nn.Embedding(max_seq_len, dim, padding_idx=pad_id)
107
- self.dropout = nn.Dropout(dropout)
108
-
109
- # blocks
110
- self.blocks = nn.ModuleList([
111
- AttentionBlock(dim, num_heads, post_norm, dropout, eps)
112
- for _ in range(num_layers)
113
- ])
114
-
115
- # norm layer
116
- self.norm = nn.LayerNorm(dim, eps=eps)
117
-
118
- def forward(self, ids):
119
- """
120
- ids: [B, L] of torch.LongTensor.
121
- """
122
- b, s = ids.shape
123
- mask = ids.ne(self.pad_id).long()
124
-
125
- # embeddings
126
- x = self.token_embedding(ids) + \
127
- self.type_embedding(torch.zeros_like(ids)) + \
128
- self.pos_embedding(self.pad_id + torch.cumsum(mask, dim=1) * mask)
129
- if self.post_norm:
130
- x = self.norm(x)
131
- x = self.dropout(x)
132
-
133
- # blocks
134
- mask = torch.where(
135
- mask.view(b, 1, 1, s).gt(0), 0.0,
136
- torch.finfo(x.dtype).min)
137
- for block in self.blocks:
138
- x = block(x, mask)
139
-
140
- # output
141
- if not self.post_norm:
142
- x = self.norm(x)
143
- return x
144
-
145
-
146
- def xlm_roberta_large(pretrained=False,
147
- return_tokenizer=False,
148
- device='cpu',
149
- **kwargs):
150
- """
151
- XLMRobertaLarge adapted from Huggingface.
152
- """
153
- # params
154
- cfg = dict(
155
- vocab_size=250002,
156
- max_seq_len=514,
157
- type_size=1,
158
- pad_id=1,
159
- dim=1024,
160
- num_heads=16,
161
- num_layers=24,
162
- post_norm=True,
163
- dropout=0.1,
164
- eps=1e-5)
165
- cfg.update(**kwargs)
166
-
167
- # init a model on device
168
- with torch.device(device):
169
- model = XLMRoberta(**cfg)
170
- return model
videox_fun/models/z_image_transformer2d.py DELETED
@@ -1,1050 +0,0 @@
1
- # Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import glob
16
- import inspect
17
- import json
18
- import os
19
- import math
20
- from typing import Any, Dict, List, Optional, Tuple, Union
21
-
22
- import torch
23
- import torch.nn as nn
24
- import torch.nn.functional as F
25
- import torch
26
- import torch.nn as nn
27
- import torch.nn.functional as F
28
- from torch.nn.utils.rnn import pad_sequence
29
-
30
- from diffusers.configuration_utils import ConfigMixin, register_to_config
31
- from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
32
- from diffusers.models.attention_processor import Attention
33
- from diffusers.models.modeling_utils import ModelMixin
34
- from diffusers.models.normalization import RMSNorm
35
- from diffusers.utils.torch_utils import maybe_allow_in_graph
36
- from diffusers.models.attention_processor import Attention, AttentionProcessor
37
- from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging,
38
- scale_lora_layers, unscale_lora_layers)
39
-
40
- from .attention_utils import attention
41
- from ..dist import (ZMultiGPUsSingleStreamAttnProcessor, get_sequence_parallel_rank,
42
- get_sequence_parallel_world_size, get_sp_group)
43
-
44
-
45
- ADALN_EMBED_DIM = 256
46
- SEQ_MULTI_OF = 32
47
-
48
-
49
- class TimestepEmbedder(nn.Module):
50
- def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
51
- super().__init__()
52
- if mid_size is None:
53
- mid_size = out_size
54
- self.mlp = nn.Sequential(
55
- nn.Linear(
56
- frequency_embedding_size,
57
- mid_size,
58
- bias=True,
59
- ),
60
- nn.SiLU(),
61
- nn.Linear(
62
- mid_size,
63
- out_size,
64
- bias=True,
65
- ),
66
- )
67
-
68
- self.frequency_embedding_size = frequency_embedding_size
69
-
70
- @staticmethod
71
- def timestep_embedding(t, dim, max_period=10000):
72
- with torch.amp.autocast("cuda", enabled=False):
73
- half = dim // 2
74
- freqs = torch.exp(
75
- -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
76
- )
77
- args = t[:, None].float() * freqs[None]
78
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
79
- if dim % 2:
80
- embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
81
- return embedding
82
-
83
- def forward(self, t):
84
- t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
85
- weight_dtype = self.mlp[0].weight.dtype
86
- if weight_dtype.is_floating_point:
87
- t_freq = t_freq.to(weight_dtype)
88
- t_emb = self.mlp(t_freq)
89
- return t_emb
90
-
91
-
92
- class ZSingleStreamAttnProcessor:
93
- """
94
- Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
95
- original Z-ImageAttention module.
96
- """
97
-
98
- _attention_backend = None
99
- _parallel_config = None
100
-
101
- def __init__(self):
102
- if not hasattr(F, "scaled_dot_product_attention"):
103
- raise ImportError(
104
- "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher."
105
- )
106
-
107
- def __call__(
108
- self,
109
- attn: Attention,
110
- hidden_states: torch.Tensor,
111
- encoder_hidden_states: Optional[torch.Tensor] = None,
112
- attention_mask: Optional[torch.Tensor] = None,
113
- freqs_cis: Optional[torch.Tensor] = None,
114
- ) -> torch.Tensor:
115
- query = attn.to_q(hidden_states)
116
- key = attn.to_k(hidden_states)
117
- value = attn.to_v(hidden_states)
118
-
119
- query = query.unflatten(-1, (attn.heads, -1))
120
- key = key.unflatten(-1, (attn.heads, -1))
121
- value = value.unflatten(-1, (attn.heads, -1))
122
-
123
- # Apply Norms
124
- if attn.norm_q is not None:
125
- query = attn.norm_q(query)
126
- if attn.norm_k is not None:
127
- key = attn.norm_k(key)
128
-
129
- # Apply RoPE
130
- def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
131
- with torch.amp.autocast("cuda", enabled=False):
132
- x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2))
133
- freqs_cis = freqs_cis.unsqueeze(2)
134
- x_out = torch.view_as_real(x * freqs_cis).flatten(3)
135
- return x_out.type_as(x_in) # todo
136
-
137
- if freqs_cis is not None:
138
- query = apply_rotary_emb(query, freqs_cis)
139
- key = apply_rotary_emb(key, freqs_cis)
140
-
141
- # Cast to correct dtype
142
- dtype = query.dtype
143
- query, key = query.to(dtype), key.to(dtype)
144
-
145
- # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len]
146
- if attention_mask is not None and attention_mask.ndim == 2:
147
- attention_mask = attention_mask[:, None, None, :]
148
-
149
- # Compute joint attention
150
- hidden_states = attention(
151
- query,
152
- key,
153
- value,
154
- attn_mask=attention_mask
155
- )
156
-
157
- # Reshape back
158
- hidden_states = hidden_states.flatten(2, 3)
159
- hidden_states = hidden_states.to(dtype)
160
-
161
- output = attn.to_out[0](hidden_states)
162
- if len(attn.to_out) > 1: # dropout
163
- output = attn.to_out[1](output)
164
-
165
- return output
166
-
167
-
168
- class FeedForward(nn.Module):
169
- def __init__(self, dim: int, hidden_dim: int):
170
- super().__init__()
171
- self.w1 = nn.Linear(dim, hidden_dim, bias=False)
172
- self.w2 = nn.Linear(hidden_dim, dim, bias=False)
173
- self.w3 = nn.Linear(dim, hidden_dim, bias=False)
174
-
175
- def _forward_silu_gating(self, x1, x3):
176
- return F.silu(x1) * x3
177
-
178
- def forward(self, x):
179
- return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
180
-
181
-
182
- @maybe_allow_in_graph
183
- class ZImageTransformerBlock(nn.Module):
184
- def __init__(
185
- self,
186
- layer_id: int,
187
- dim: int,
188
- n_heads: int,
189
- n_kv_heads: int,
190
- norm_eps: float,
191
- qk_norm: bool,
192
- modulation=True,
193
- ):
194
- super().__init__()
195
- self.dim = dim
196
- self.head_dim = dim // n_heads
197
-
198
- # Refactored to use diffusers Attention with custom processor
199
- # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm
200
- self.attention = Attention(
201
- query_dim=dim,
202
- cross_attention_dim=None,
203
- dim_head=dim // n_heads,
204
- heads=n_heads,
205
- qk_norm="rms_norm" if qk_norm else None,
206
- eps=1e-5,
207
- bias=False,
208
- out_bias=False,
209
- processor=ZSingleStreamAttnProcessor(),
210
- )
211
-
212
- self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
213
- self.layer_id = layer_id
214
-
215
- self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
216
- self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
217
-
218
- self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
219
- self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
220
-
221
- self.modulation = modulation
222
- if modulation:
223
- self.adaLN_modulation = nn.Sequential(
224
- nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True),
225
- )
226
-
227
- def forward(
228
- self,
229
- x: torch.Tensor,
230
- attn_mask: torch.Tensor,
231
- freqs_cis: torch.Tensor,
232
- adaln_input: Optional[torch.Tensor] = None,
233
- ):
234
- if self.modulation:
235
- assert adaln_input is not None
236
- scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
237
- gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh()
238
- scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp
239
-
240
- # Attention block
241
- attn_out = self.attention(
242
- self.attention_norm1(x) * scale_msa,
243
- attention_mask=attn_mask,
244
- freqs_cis=freqs_cis,
245
- )
246
- x = x + gate_msa * self.attention_norm2(attn_out)
247
-
248
- # FFN block
249
- x = x + gate_mlp * self.ffn_norm2(
250
- self.feed_forward(
251
- self.ffn_norm1(x) * scale_mlp,
252
- )
253
- )
254
- else:
255
- # Attention block
256
- attn_out = self.attention(
257
- self.attention_norm1(x),
258
- attention_mask=attn_mask,
259
- freqs_cis=freqs_cis,
260
- )
261
- x = x + self.attention_norm2(attn_out)
262
-
263
- # FFN block
264
- x = x + self.ffn_norm2(
265
- self.feed_forward(
266
- self.ffn_norm1(x),
267
- )
268
- )
269
-
270
- return x
271
-
272
-
273
- class FinalLayer(nn.Module):
274
- def __init__(self, hidden_size, out_channels):
275
- super().__init__()
276
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
277
- self.linear = nn.Linear(hidden_size, out_channels, bias=True)
278
-
279
- self.adaLN_modulation = nn.Sequential(
280
- nn.SiLU(),
281
- nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
282
- )
283
-
284
- def forward(self, x, c):
285
- scale = 1.0 + self.adaLN_modulation(c)
286
- x = self.norm_final(x) * scale.unsqueeze(1)
287
- x = self.linear(x)
288
- return x
289
-
290
-
291
- class RopeEmbedder:
292
- def __init__(
293
- self,
294
- theta: float = 256.0,
295
- axes_dims: List[int] = (16, 56, 56),
296
- axes_lens: List[int] = (64, 128, 128),
297
- ):
298
- self.theta = theta
299
- self.axes_dims = axes_dims
300
- self.axes_lens = axes_lens
301
- assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must have the same length"
302
- self.freqs_cis = None
303
-
304
- @staticmethod
305
- def precompute_freqs_cis(dim: List[int], end: List[int], theta: float = 256.0):
306
- with torch.device("cpu"):
307
- freqs_cis = []
308
- for i, (d, e) in enumerate(zip(dim, end)):
309
- freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d))
310
- timestep = torch.arange(e, device=freqs.device, dtype=torch.float64)
311
- freqs = torch.outer(timestep, freqs).float()
312
- freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to(torch.complex64) # complex64
313
- freqs_cis.append(freqs_cis_i)
314
-
315
- return freqs_cis
316
-
317
- def __call__(self, ids: torch.Tensor):
318
- assert ids.ndim == 2
319
- assert ids.shape[-1] == len(self.axes_dims)
320
- device = ids.device
321
-
322
- if self.freqs_cis is None:
323
- self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta)
324
- self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
325
- else:
326
- # Ensure freqs_cis are on the same device as ids
327
- if self.freqs_cis[0].device != device:
328
- self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis]
329
-
330
- result = []
331
- for i in range(len(self.axes_dims)):
332
- index = ids[:, i]
333
- result.append(self.freqs_cis[i][index])
334
- return torch.cat(result, dim=-1)
335
-
336
-
337
- class ZImageTransformer2DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
338
- _supports_gradient_checkpointing = True
339
- # _no_split_modules = ["ZImageTransformerBlock"]
340
- # _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"] # precision sensitive layers
341
-
342
- @register_to_config
343
- def __init__(
344
- self,
345
- all_patch_size=(2,),
346
- all_f_patch_size=(1,),
347
- in_channels=16,
348
- dim=3840,
349
- n_layers=30,
350
- n_refiner_layers=2,
351
- n_heads=30,
352
- n_kv_heads=30,
353
- norm_eps=1e-5,
354
- qk_norm=True,
355
- cap_feat_dim=2560,
356
- rope_theta=256.0,
357
- t_scale=1000.0,
358
- axes_dims=[32, 48, 48],
359
- axes_lens=[1024, 512, 512],
360
- ) -> None:
361
- super().__init__()
362
- self.in_channels = in_channels
363
- self.out_channels = in_channels
364
- self.all_patch_size = all_patch_size
365
- self.all_f_patch_size = all_f_patch_size
366
- self.dim = dim
367
- self.n_heads = n_heads
368
-
369
- self.rope_theta = rope_theta
370
- self.t_scale = t_scale
371
- self.gradient_checkpointing = False
372
-
373
- assert len(all_patch_size) == len(all_f_patch_size)
374
-
375
- all_x_embedder = {}
376
- all_final_layer = {}
377
- for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)):
378
- x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * in_channels, dim, bias=True)
379
- all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
380
-
381
- final_layer = FinalLayer(dim, patch_size * patch_size * f_patch_size * self.out_channels)
382
- all_final_layer[f"{patch_size}-{f_patch_size}"] = final_layer
383
-
384
- self.all_x_embedder = nn.ModuleDict(all_x_embedder)
385
- self.all_final_layer = nn.ModuleDict(all_final_layer)
386
- self.noise_refiner = nn.ModuleList(
387
- [
388
- ZImageTransformerBlock(
389
- 1000 + layer_id,
390
- dim,
391
- n_heads,
392
- n_kv_heads,
393
- norm_eps,
394
- qk_norm,
395
- modulation=True,
396
- )
397
- for layer_id in range(n_refiner_layers)
398
- ]
399
- )
400
- self.context_refiner = nn.ModuleList(
401
- [
402
- ZImageTransformerBlock(
403
- layer_id,
404
- dim,
405
- n_heads,
406
- n_kv_heads,
407
- norm_eps,
408
- qk_norm,
409
- modulation=False,
410
- )
411
- for layer_id in range(n_refiner_layers)
412
- ]
413
- )
414
- self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
415
- self.cap_embedder = nn.Sequential(
416
- RMSNorm(cap_feat_dim, eps=norm_eps),
417
- nn.Linear(cap_feat_dim, dim, bias=True),
418
- )
419
-
420
- self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
421
- self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))
422
-
423
- self.layers = nn.ModuleList(
424
- [
425
- ZImageTransformerBlock(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm)
426
- for layer_id in range(n_layers)
427
- ]
428
- )
429
- head_dim = dim // n_heads
430
- assert head_dim == sum(axes_dims)
431
- self.axes_dims = axes_dims
432
- self.axes_lens = axes_lens
433
-
434
- self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=axes_dims, axes_lens=axes_lens)
435
-
436
- self.sp_world_size = 1
437
- self.sp_world_rank = 0
438
-
439
- def _set_gradient_checkpointing(self, *args, **kwargs):
440
- if "value" in kwargs:
441
- self.gradient_checkpointing = kwargs["value"]
442
- elif "enable" in kwargs:
443
- self.gradient_checkpointing = kwargs["enable"]
444
- else:
445
- raise ValueError("Invalid set gradient checkpointing")
446
-
447
- def enable_multi_gpus_inference(self,):
448
- self.sp_world_size = get_sequence_parallel_world_size()
449
- self.sp_world_rank = get_sequence_parallel_rank()
450
- self.all_gather = get_sp_group().all_gather
451
- self.set_attn_processor(ZMultiGPUsSingleStreamAttnProcessor())
452
-
453
- @property
454
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
455
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
456
- r"""
457
- Returns:
458
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
459
- indexed by its weight name.
460
- """
461
- # set recursively
462
- processors = {}
463
-
464
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
465
- if hasattr(module, "get_processor"):
466
- processors[f"{name}.processor"] = module.get_processor()
467
-
468
- for sub_name, child in module.named_children():
469
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
470
-
471
- return processors
472
-
473
- for name, module in self.named_children():
474
- fn_recursive_add_processors(name, module, processors)
475
-
476
- return processors
477
-
478
- # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
479
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
480
- r"""
481
- Sets the attention processor to use to compute attention.
482
-
483
- Parameters:
484
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
485
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
486
- for **all** `Attention` layers.
487
-
488
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
489
- processor. This is strongly recommended when setting trainable attention processors.
490
-
491
- """
492
- count = len(self.attn_processors.keys())
493
-
494
- if isinstance(processor, dict) and len(processor) != count:
495
- raise ValueError(
496
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
497
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
498
- )
499
-
500
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
501
- if hasattr(module, "set_processor"):
502
- if not isinstance(processor, dict):
503
- module.set_processor(processor)
504
- else:
505
- module.set_processor(processor.pop(f"{name}.processor"))
506
-
507
- for sub_name, child in module.named_children():
508
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
509
-
510
- for name, module in self.named_children():
511
- fn_recursive_attn_processor(name, module, processor)
512
-
513
- def unpatchify(self, x: List[torch.Tensor], size: List[Tuple], patch_size, f_patch_size) -> List[torch.Tensor]:
514
- pH = pW = patch_size
515
- pF = f_patch_size
516
- bsz = len(x)
517
- assert len(size) == bsz
518
- for i in range(bsz):
519
- F, H, W = size[i]
520
- ori_len = (F // pF) * (H // pH) * (W // pW)
521
- # "f h w pf ph pw c -> c (f pf) (h ph) (w pw)"
522
- x[i] = (
523
- x[i][:ori_len]
524
- .view(F // pF, H // pH, W // pW, pF, pH, pW, self.out_channels)
525
- .permute(6, 0, 3, 1, 4, 2, 5)
526
- .reshape(self.out_channels, F, H, W)
527
- )
528
- return x
529
-
530
- @staticmethod
531
- def create_coordinate_grid(size, start=None, device=None):
532
- if start is None:
533
- start = (0 for _ in size)
534
-
535
- axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
536
- grids = torch.meshgrid(axes, indexing="ij")
537
- return torch.stack(grids, dim=-1)
538
-
539
- def patchify(
540
- self,
541
- all_image: List[torch.Tensor],
542
- patch_size: int,
543
- f_patch_size: int,
544
- cap_padding_len: int,
545
- ):
546
- pH = pW = patch_size
547
- pF = f_patch_size
548
- device = all_image[0].device
549
-
550
- all_image_out = []
551
- all_image_size = []
552
- all_image_pos_ids = []
553
- all_image_pad_mask = []
554
-
555
- for i, image in enumerate(all_image):
556
- ### Process Image
557
- C, F, H, W = image.size()
558
- all_image_size.append((F, H, W))
559
- F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
560
-
561
- image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
562
- # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
563
- image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
564
-
565
- image_ori_len = len(image)
566
- image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
567
-
568
- image_ori_pos_ids = self.create_coordinate_grid(
569
- size=(F_tokens, H_tokens, W_tokens),
570
- start=(cap_padding_len + 1, 0, 0),
571
- device=device,
572
- ).flatten(0, 2)
573
- image_padding_pos_ids = (
574
- self.create_coordinate_grid(
575
- size=(1, 1, 1),
576
- start=(0, 0, 0),
577
- device=device,
578
- )
579
- .flatten(0, 2)
580
- .repeat(image_padding_len, 1)
581
- )
582
- image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
583
- all_image_pos_ids.append(image_padded_pos_ids)
584
- # pad mask
585
- all_image_pad_mask.append(
586
- torch.cat(
587
- [
588
- torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
589
- torch.ones((image_padding_len,), dtype=torch.bool, device=device),
590
- ],
591
- dim=0,
592
- )
593
- )
594
- # padded feature
595
- image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
596
- all_image_out.append(image_padded_feat)
597
-
598
- return (
599
- all_image_out,
600
- all_image_size,
601
- all_image_pos_ids,
602
- all_image_pad_mask,
603
- )
604
-
605
- def patchify_and_embed(
606
- self,
607
- all_image: List[torch.Tensor],
608
- all_cap_feats: List[torch.Tensor],
609
- patch_size: int,
610
- f_patch_size: int,
611
- ):
612
- pH = pW = patch_size
613
- pF = f_patch_size
614
- device = all_image[0].device
615
-
616
- all_image_out = []
617
- all_image_size = []
618
- all_image_pos_ids = []
619
- all_image_pad_mask = []
620
- all_cap_pos_ids = []
621
- all_cap_pad_mask = []
622
- all_cap_feats_out = []
623
-
624
- for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)):
625
- ### Process Caption
626
- cap_ori_len = len(cap_feat)
627
- cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
628
- # padded position ids
629
- cap_padded_pos_ids = self.create_coordinate_grid(
630
- size=(cap_ori_len + cap_padding_len, 1, 1),
631
- start=(1, 0, 0),
632
- device=device,
633
- ).flatten(0, 2)
634
- all_cap_pos_ids.append(cap_padded_pos_ids)
635
- # pad mask
636
- all_cap_pad_mask.append(
637
- torch.cat(
638
- [
639
- torch.zeros((cap_ori_len,), dtype=torch.bool, device=device),
640
- torch.ones((cap_padding_len,), dtype=torch.bool, device=device),
641
- ],
642
- dim=0,
643
- )
644
- )
645
- # padded feature
646
- cap_padded_feat = torch.cat(
647
- [cap_feat, cap_feat[-1:].repeat(cap_padding_len, 1)],
648
- dim=0,
649
- )
650
- all_cap_feats_out.append(cap_padded_feat)
651
-
652
- ### Process Image
653
- C, F, H, W = image.size()
654
- all_image_size.append((F, H, W))
655
- F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
656
-
657
- image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
658
- # "c f pf h ph w pw -> (f h w) (pf ph pw c)"
659
- image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
660
-
661
- image_ori_len = len(image)
662
- image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
663
-
664
- image_ori_pos_ids = self.create_coordinate_grid(
665
- size=(F_tokens, H_tokens, W_tokens),
666
- start=(cap_ori_len + cap_padding_len + 1, 0, 0),
667
- device=device,
668
- ).flatten(0, 2)
669
- image_padding_pos_ids = (
670
- self.create_coordinate_grid(
671
- size=(1, 1, 1),
672
- start=(0, 0, 0),
673
- device=device,
674
- )
675
- .flatten(0, 2)
676
- .repeat(image_padding_len, 1)
677
- )
678
- image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
679
- all_image_pos_ids.append(image_padded_pos_ids)
680
- # pad mask
681
- all_image_pad_mask.append(
682
- torch.cat(
683
- [
684
- torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
685
- torch.ones((image_padding_len,), dtype=torch.bool, device=device),
686
- ],
687
- dim=0,
688
- )
689
- )
690
- # padded feature
691
- image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
692
- all_image_out.append(image_padded_feat)
693
-
694
- return (
695
- all_image_out,
696
- all_cap_feats_out,
697
- all_image_size,
698
- all_image_pos_ids,
699
- all_cap_pos_ids,
700
- all_image_pad_mask,
701
- all_cap_pad_mask,
702
- )
703
-
704
- def forward(
705
- self,
706
- x: List[torch.Tensor],
707
- t,
708
- cap_feats: List[torch.Tensor],
709
- patch_size=2,
710
- f_patch_size=1,
711
- ):
712
- assert patch_size in self.all_patch_size
713
- assert f_patch_size in self.all_f_patch_size
714
-
715
- bsz = len(x)
716
- device = x[0].device
717
- t = t * self.t_scale
718
- t = self.t_embedder(t)
719
-
720
- (
721
- x,
722
- cap_feats,
723
- x_size,
724
- x_pos_ids,
725
- cap_pos_ids,
726
- x_inner_pad_mask,
727
- cap_inner_pad_mask,
728
- ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size)
729
-
730
- # x embed & refine
731
- x_item_seqlens = [len(_) for _ in x]
732
- assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens)
733
- x_max_item_seqlen = max(x_item_seqlens)
734
-
735
- x = torch.cat(x, dim=0)
736
- x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x)
737
-
738
- # Match t_embedder output dtype to x for layerwise casting compatibility
739
- adaln_input = t.type_as(x)
740
- x[torch.cat(x_inner_pad_mask)] = self.x_pad_token
741
- x = list(x.split(x_item_seqlens, dim=0))
742
- x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split(x_item_seqlens, dim=0))
743
-
744
- x = pad_sequence(x, batch_first=True, padding_value=0.0)
745
- x_freqs_cis = pad_sequence(x_freqs_cis, batch_first=True, padding_value=0.0)
746
- x_attn_mask = torch.zeros((bsz, x_max_item_seqlen), dtype=torch.bool, device=device)
747
- for i, seq_len in enumerate(x_item_seqlens):
748
- x_attn_mask[i, :seq_len] = 1
749
-
750
- # Context Parallel
751
- if self.sp_world_size > 1:
752
- x = torch.chunk(x, self.sp_world_size, dim=1)[self.sp_world_rank]
753
-
754
- if torch.is_grad_enabled() and self.gradient_checkpointing:
755
- for layer in self.noise_refiner:
756
- def create_custom_forward(module):
757
- def custom_forward(*inputs):
758
- return module(*inputs)
759
-
760
- return custom_forward
761
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
762
- x = torch.utils.checkpoint.checkpoint(
763
- create_custom_forward(layer),
764
- x, x_attn_mask, x_freqs_cis, adaln_input,
765
- **ckpt_kwargs,
766
- )
767
- else:
768
- for layer in self.noise_refiner:
769
- x = layer(x, x_attn_mask, x_freqs_cis, adaln_input)
770
-
771
- # cap embed & refine
772
- cap_item_seqlens = [len(_) for _ in cap_feats]
773
- assert all(_ % SEQ_MULTI_OF == 0 for _ in cap_item_seqlens)
774
- cap_max_item_seqlen = max(cap_item_seqlens)
775
-
776
- cap_feats = torch.cat(cap_feats, dim=0)
777
- cap_feats = self.cap_embedder(cap_feats)
778
- cap_feats[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token
779
- cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0))
780
- cap_freqs_cis = list(self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split(cap_item_seqlens, dim=0))
781
-
782
- cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0)
783
- cap_freqs_cis = pad_sequence(cap_freqs_cis, batch_first=True, padding_value=0.0)
784
- cap_attn_mask = torch.zeros((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device)
785
- for i, seq_len in enumerate(cap_item_seqlens):
786
- cap_attn_mask[i, :seq_len] = 1
787
-
788
- if torch.is_grad_enabled() and self.gradient_checkpointing:
789
- for layer in self.context_refiner:
790
- def create_custom_forward(module):
791
- def custom_forward(*inputs):
792
- return module(*inputs)
793
-
794
- return custom_forward
795
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
796
- cap_feats = torch.utils.checkpoint.checkpoint(
797
- create_custom_forward(layer),
798
- cap_feats,
799
- cap_attn_mask,
800
- cap_freqs_cis,
801
- **ckpt_kwargs,
802
- )
803
- else:
804
- for layer in self.context_refiner:
805
- cap_feats = layer(cap_feats, cap_attn_mask, cap_freqs_cis)
806
-
807
- # unified
808
- unified = []
809
- unified_freqs_cis = []
810
- for i in range(bsz):
811
- x_len = x_item_seqlens[i]
812
- cap_len = cap_item_seqlens[i]
813
- unified.append(torch.cat([x[i][:x_len], cap_feats[i][:cap_len]]))
814
- unified_freqs_cis.append(torch.cat([x_freqs_cis[i][:x_len], cap_freqs_cis[i][:cap_len]]))
815
- unified_item_seqlens = [a + b for a, b in zip(cap_item_seqlens, x_item_seqlens)]
816
- assert unified_item_seqlens == [len(_) for _ in unified]
817
- unified_max_item_seqlen = max(unified_item_seqlens)
818
-
819
- unified = pad_sequence(unified, batch_first=True, padding_value=0.0)
820
- unified_freqs_cis = pad_sequence(unified_freqs_cis, batch_first=True, padding_value=0.0)
821
- unified_attn_mask = torch.zeros((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device)
822
- for i, seq_len in enumerate(unified_item_seqlens):
823
- unified_attn_mask[i, :seq_len] = 1
824
-
825
- if torch.is_grad_enabled() and self.gradient_checkpointing:
826
- for layer in self.layers:
827
- def create_custom_forward(module):
828
- def custom_forward(*inputs):
829
- return module(*inputs)
830
-
831
- return custom_forward
832
- ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
833
- unified = torch.utils.checkpoint.checkpoint(
834
- create_custom_forward(layer),
835
- unified,
836
- unified_attn_mask,
837
- unified_freqs_cis,
838
- adaln_input,
839
- **ckpt_kwargs,
840
- )
841
- else:
842
- for layer in self.layers:
843
- unified = layer(unified, unified_attn_mask, unified_freqs_cis, adaln_input)
844
-
845
- unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
846
- unified = list(unified.unbind(dim=0))
847
- x = self.unpatchify(unified, x_size, patch_size, f_patch_size)
848
-
849
- if self.sp_world_size > 1:
850
- x = self.all_gather(x, dim=1)
851
- x = torch.stack(x)
852
- return x, {}
853
-
854
-
855
- @classmethod
856
- def from_pretrained(
857
- cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={},
858
- low_cpu_mem_usage=False, torch_dtype=torch.bfloat16
859
- ):
860
- if subfolder is not None:
861
- pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
862
- print(f"loaded 3D transformer's pretrained weights from {pretrained_model_path} ...")
863
-
864
- config_file = os.path.join(pretrained_model_path, 'config.json')
865
- if not os.path.isfile(config_file):
866
- raise RuntimeError(f"{config_file} does not exist")
867
- with open(config_file, "r") as f:
868
- config = json.load(f)
869
-
870
- from diffusers.utils import WEIGHTS_NAME
871
- model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
872
- model_file_safetensors = model_file.replace(".bin", ".safetensors")
873
-
874
- if "dict_mapping" in transformer_additional_kwargs.keys():
875
- for key in transformer_additional_kwargs["dict_mapping"]:
876
- transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key]
877
-
878
- if low_cpu_mem_usage:
879
- try:
880
- import re
881
-
882
- from diffusers import __version__ as diffusers_version
883
- if diffusers_version >= "0.33.0":
884
- from diffusers.models.model_loading_utils import \
885
- load_model_dict_into_meta
886
- else:
887
- from diffusers.models.modeling_utils import \
888
- load_model_dict_into_meta
889
- from diffusers.utils import is_accelerate_available
890
- if is_accelerate_available():
891
- import accelerate
892
-
893
- # Instantiate model with empty weights
894
- with accelerate.init_empty_weights():
895
- model = cls.from_config(config, **transformer_additional_kwargs)
896
-
897
- param_device = "cpu"
898
- if os.path.exists(model_file):
899
- state_dict = torch.load(model_file, map_location="cpu")
900
- elif os.path.exists(model_file_safetensors):
901
- from safetensors.torch import load_file, safe_open
902
- state_dict = load_file(model_file_safetensors)
903
- else:
904
- from safetensors.torch import load_file, safe_open
905
- model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
906
- state_dict = {}
907
- print(model_files_safetensors)
908
- for _model_file_safetensors in model_files_safetensors:
909
- _state_dict = load_file(_model_file_safetensors)
910
- for key in _state_dict:
911
- state_dict[key] = _state_dict[key]
912
-
913
- filtered_state_dict = {}
914
- for key in state_dict:
915
- if key in model.state_dict() and model.state_dict()[key].size() == state_dict[key].size():
916
- filtered_state_dict[key] = state_dict[key]
917
- else:
918
- print(f"Skipping key '{key}' due to size mismatch or absence in model.")
919
-
920
- model_keys = set(model.state_dict().keys())
921
- loaded_keys = set(filtered_state_dict.keys())
922
- missing_keys = model_keys - loaded_keys
923
-
924
- def initialize_missing_parameters(missing_keys, model_state_dict, torch_dtype=None):
925
- initialized_dict = {}
926
-
927
- with torch.no_grad():
928
- for key in missing_keys:
929
- param_shape = model_state_dict[key].shape
930
- param_dtype = torch_dtype if torch_dtype is not None else model_state_dict[key].dtype
931
- if "control" in key and key.replace("control_", "") in filtered_state_dict.keys():
932
- initialized_dict[key] = filtered_state_dict[key.replace("control_", "")].clone()
933
- print(f"Initializing missing parameter '{key}' with model.state_dict().")
934
- elif "after_proj" in key or "before_proj" in key:
935
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
936
- print(f"Initializing missing parameter '{key}' with zero.")
937
- elif 'weight' in key:
938
- if any(norm_type in key for norm_type in ['norm', 'ln_', 'layer_norm', 'group_norm', 'batch_norm']):
939
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
940
- elif 'embedding' in key or 'embed' in key:
941
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
942
- elif 'head' in key or 'output' in key or 'proj_out' in key:
943
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
944
- elif len(param_shape) >= 2:
945
- initialized_dict[key] = torch.empty(param_shape, dtype=param_dtype)
946
- nn.init.xavier_uniform_(initialized_dict[key])
947
- else:
948
- initialized_dict[key] = torch.randn(param_shape, dtype=param_dtype) * 0.02
949
- elif 'bias' in key:
950
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
951
- elif 'running_mean' in key:
952
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
953
- elif 'running_var' in key:
954
- initialized_dict[key] = torch.ones(param_shape, dtype=param_dtype)
955
- elif 'num_batches_tracked' in key:
956
- initialized_dict[key] = torch.zeros(param_shape, dtype=torch.long)
957
- else:
958
- initialized_dict[key] = torch.zeros(param_shape, dtype=param_dtype)
959
-
960
- return initialized_dict
961
-
-         if missing_keys:
-             print(f"Missing keys will be initialized: {sorted(missing_keys)}")
-             initialized_params = initialize_missing_parameters(
-                 missing_keys,
-                 model.state_dict(),
-                 torch_dtype
-             )
-             filtered_state_dict.update(initialized_params)
-
-         if diffusers_version >= "0.33.0":
-             # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit:
-             # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785.
-             load_model_dict_into_meta(
-                 model,
-                 filtered_state_dict,
-                 dtype=torch_dtype,
-                 model_name_or_path=pretrained_model_path,
-             )
-         else:
-             model._convert_deprecated_attention_blocks(filtered_state_dict)
-             unexpected_keys = load_model_dict_into_meta(
-                 model,
-                 filtered_state_dict,
-                 device=param_device,
-                 dtype=torch_dtype,
-                 model_name_or_path=pretrained_model_path,
-             )
-
-             if cls._keys_to_ignore_on_load_unexpected is not None:
-                 for pat in cls._keys_to_ignore_on_load_unexpected:
-                     unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
-
-             if len(unexpected_keys) > 0:
-                 print(
-                     f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}"
-                 )
-
-         params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
-         print(f"### All Parameters: {sum(params) / 1e6} M")
-
-         params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
-         print(f"### attn1 Parameters: {sum(params) / 1e6} M")
-         return model
-     except Exception as e:
-         print(
-             f"The low_cpu_mem_usage mode did not work because {e}. Falling back to low_cpu_mem_usage=False."
-         )
-
- model = cls.from_config(config, **transformer_additional_kwargs)
- if os.path.exists(model_file):
-     state_dict = torch.load(model_file, map_location="cpu")
- elif os.path.exists(model_file_safetensors):
-     from safetensors.torch import load_file, safe_open
-     state_dict = load_file(model_file_safetensors)
- else:
-     from safetensors.torch import load_file, safe_open
-     model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors"))
-     state_dict = {}
-     for _model_file_safetensors in model_files_safetensors:
-         _state_dict = load_file(_model_file_safetensors)
-         for key in _state_dict:
-             state_dict[key] = _state_dict[key]
-
- tmp_state_dict = {}
- for key in state_dict:
-     if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size():
-         tmp_state_dict[key] = state_dict[key]
-     else:
-         print(key, "size mismatch, skipping")
-
- for key in model.state_dict():
-     if "control" in key and key.replace("control_", "") in state_dict.keys() and model.state_dict()[key].size() == state_dict[key.replace("control_", "")].size():
-         tmp_state_dict[key] = state_dict[key.replace("control_", "")].clone()
-         print(f"Initializing missing parameter '{key}' with model.state_dict().")
-
- state_dict = tmp_state_dict
-
- m, u = model.load_state_dict(state_dict, strict=False)
- print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
- print(m)
-
- params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()]
- print(f"### All Parameters: {sum(params) / 1e6} M")
-
- params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()]
- print(f"### attn1 Parameters: {sum(params) / 1e6} M")
-
- model = model.to(torch_dtype)
- return model
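
For reference, the deleted loader above follows the familiar diffusers-style `from_pretrained` flow: build the transformer from its config, collect weights from `*.bin` or `*.safetensors` files, drop keys whose shapes do not match the model, initialize or copy any still-missing keys, and load the remainder non-strictly before casting to the requested dtype. A minimal usage sketch under those assumptions follows; the class name `WanTransformer3DModel`, the checkpoint directory, and the keyword values are illustrative placeholders and are not confirmed by this commit.

import torch
from videox_fun.models import WanTransformer3DModel  # assumed export; class name is a placeholder

# Hypothetical call into the from_pretrained-style loader removed above.
# The path and keyword values are assumptions for illustration only.
transformer = WanTransformer3DModel.from_pretrained(
    "models/Wan2.1-T2V-14B",            # placeholder directory containing config.json and weight files
    transformer_additional_kwargs={},    # extra config overrides passed through to from_config
    low_cpu_mem_usage=True,              # try the accelerate meta-weights path, fall back on failure
    torch_dtype=torch.bfloat16,          # dtype the loaded weights are cast to
)
transformer.eval()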